1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "xbzrle.h"
39 #include "migration/migration.h"
40 #include "postcopy-ram.h"
41 #include "exec/address-spaces.h"
42 #include "migration/page_cache.h"
43 #include "qemu/error-report.h"
44 #include "trace.h"
45 #include "exec/ram_addr.h"
46 #include "qemu/rcu_queue.h"
47 #include "migration/colo.h"
49 /***********************************************************/
50 /* ram save/restore */
52 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
53 * worked for pages that were filled with the same char. We switched
54 * it to only search for the zero value, and to avoid confusion with
55 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
56 */
58 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
59 #define RAM_SAVE_FLAG_ZERO 0x02
60 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
61 #define RAM_SAVE_FLAG_PAGE 0x08
62 #define RAM_SAVE_FLAG_EOS 0x10
63 #define RAM_SAVE_FLAG_CONTINUE 0x20
64 #define RAM_SAVE_FLAG_XBZRLE 0x40
65 /* 0x80 is reserved in migration.h start with 0x100 next */
66 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
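/*
* Note: on the wire each page is preceded by a be64 word that combines the
* page's offset inside its RAMBlock (target-page aligned, so the low bits
* are free) with the RAM_SAVE_FLAG_* bits above. For example, a normal page
* at offset 0x3000 goes out as 0x3000 | RAM_SAVE_FLAG_PAGE (= 0x3008),
* optionally with RAM_SAVE_FLAG_CONTINUE set when it belongs to the same
* block as the previous page.
*/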
68 static uint8_t *ZERO_TARGET_PAGE;
70 static inline bool is_zero_range(uint8_t *p, uint64_t size)
72 return buffer_is_zero(p, size);
75 /* struct contains XBZRLE cache and a static page
76 used by the compression */
77 static struct {
78 /* buffer used for XBZRLE encoding */
79 uint8_t *encoded_buf;
80 /* buffer for storing page content */
81 uint8_t *current_buf;
82 /* Cache for XBZRLE, Protected by lock. */
83 PageCache *cache;
84 QemuMutex lock;
85 } XBZRLE;
87 /* buffer used for XBZRLE decoding */
88 static uint8_t *xbzrle_decoded_buf;
90 static void XBZRLE_cache_lock(void)
92 if (migrate_use_xbzrle())
93 qemu_mutex_lock(&XBZRLE.lock);
96 static void XBZRLE_cache_unlock(void)
98 if (migrate_use_xbzrle())
99 qemu_mutex_unlock(&XBZRLE.lock);
103 * xbzrle_cache_resize: resize the xbzrle cache
105 * This function is called from qmp_migrate_set_cache_size in the main
106 * thread, possibly while a migration is in progress. A running
107 * migration may be using the cache and might finish during this call,
108 * hence changes to the cache are protected by XBZRLE.lock.
110 * Returns the new_size or negative in case of error.
112 * @new_size: new cache size
114 int64_t xbzrle_cache_resize(int64_t new_size)
116 PageCache *new_cache;
117 int64_t ret;
119 if (new_size < TARGET_PAGE_SIZE) {
120 return -1;
123 XBZRLE_cache_lock();
125 if (XBZRLE.cache != NULL) {
126 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
127 goto out_new_size;
129 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
130 TARGET_PAGE_SIZE);
131 if (!new_cache) {
132 error_report("Error creating cache");
133 ret = -1;
134 goto out;
137 cache_fini(XBZRLE.cache);
138 XBZRLE.cache = new_cache;
141 out_new_size:
142 ret = pow2floor(new_size);
143 out:
144 XBZRLE_cache_unlock();
145 return ret;
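/*
* Rough illustration: the returned size is rounded down to a power of two,
* so asking for a 600 MiB cache ends up reporting pow2floor(600 MiB) =
* 512 MiB, and any request smaller than one target page is rejected with -1.
*/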
149 * An outstanding page request, on the source, having been received
150 * and queued
152 struct RAMSrcPageRequest {
153 RAMBlock *rb;
154 hwaddr offset;
155 hwaddr len;
157 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
160 /* State of RAM for migration */
161 struct RAMState {
162 /* QEMUFile used for this migration */
163 QEMUFile *f;
164 /* Last block that we have visited searching for dirty pages */
165 RAMBlock *last_seen_block;
166 /* Last block from where we have sent data */
167 RAMBlock *last_sent_block;
168 /* Last dirty target page we have sent */
169 ram_addr_t last_page;
170 /* last ram version we have seen */
171 uint32_t last_version;
172 /* We are in the first round */
173 bool ram_bulk_stage;
174 /* How many times we have dirty too many pages */
175 int dirty_rate_high_cnt;
176 /* How many times we have synchronized the bitmap */
177 uint64_t bitmap_sync_count;
178 /* these variables are used for bitmap sync */
179 /* last time we did a full bitmap_sync */
180 int64_t time_last_bitmap_sync;
181 /* bytes transferred at start_time */
182 uint64_t bytes_xfer_prev;
183 /* number of dirty pages since start_time */
184 uint64_t num_dirty_pages_period;
185 /* xbzrle misses since the beginning of the period */
186 uint64_t xbzrle_cache_miss_prev;
187 /* number of iterations at the beginning of period */
188 uint64_t iterations_prev;
189 /* Accounting fields */
190 /* number of zero pages. It used to be pages filled by the same char. */
191 uint64_t zero_pages;
192 /* number of normal transferred pages */
193 uint64_t norm_pages;
194 /* Iterations since start */
195 uint64_t iterations;
196 /* xbzrle transmitted bytes. Note that these are compressed, so
197 * they can't be calculated from the page counts */
198 uint64_t xbzrle_bytes;
199 /* xbzrle transmitted pages */
200 uint64_t xbzrle_pages;
201 /* xbzrle number of cache miss */
202 uint64_t xbzrle_cache_miss;
203 /* xbzrle miss rate */
204 double xbzrle_cache_miss_rate;
205 /* xbzrle number of overflows */
206 uint64_t xbzrle_overflows;
207 /* number of dirty bits in the bitmap */
208 uint64_t migration_dirty_pages;
209 /* total number of bytes transferred */
210 uint64_t bytes_transferred;
211 /* number of dirtied pages in the last second */
212 uint64_t dirty_pages_rate;
213 /* Count of requests incoming from destination */
214 uint64_t postcopy_requests;
215 /* protects modification of the bitmap */
216 QemuMutex bitmap_mutex;
217 /* The RAMBlock used in the last src_page_requests */
218 RAMBlock *last_req_rb;
219 /* Queue of outstanding page requests from the destination */
220 QemuMutex src_page_req_mutex;
221 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
223 typedef struct RAMState RAMState;
225 static RAMState ram_state;
227 uint64_t dup_mig_pages_transferred(void)
229 return ram_state.zero_pages;
232 uint64_t norm_mig_pages_transferred(void)
234 return ram_state.norm_pages;
237 uint64_t xbzrle_mig_bytes_transferred(void)
239 return ram_state.xbzrle_bytes;
242 uint64_t xbzrle_mig_pages_transferred(void)
244 return ram_state.xbzrle_pages;
247 uint64_t xbzrle_mig_pages_cache_miss(void)
249 return ram_state.xbzrle_cache_miss;
252 double xbzrle_mig_cache_miss_rate(void)
254 return ram_state.xbzrle_cache_miss_rate;
257 uint64_t xbzrle_mig_pages_overflow(void)
259 return ram_state.xbzrle_overflows;
262 uint64_t ram_bytes_transferred(void)
264 return ram_state.bytes_transferred;
267 uint64_t ram_bytes_remaining(void)
269 return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
272 uint64_t ram_dirty_sync_count(void)
274 return ram_state.bitmap_sync_count;
277 uint64_t ram_dirty_pages_rate(void)
279 return ram_state.dirty_pages_rate;
282 uint64_t ram_postcopy_requests(void)
284 return ram_state.postcopy_requests;
287 /* used by the search for pages to send */
288 struct PageSearchStatus {
289 /* Current block being searched */
290 RAMBlock *block;
291 /* Current page to search from */
292 unsigned long page;
293 /* Set once we wrap around */
294 bool complete_round;
296 typedef struct PageSearchStatus PageSearchStatus;
298 struct CompressParam {
299 bool done;
300 bool quit;
301 QEMUFile *file;
302 QemuMutex mutex;
303 QemuCond cond;
304 RAMBlock *block;
305 ram_addr_t offset;
307 typedef struct CompressParam CompressParam;
309 struct DecompressParam {
310 bool done;
311 bool quit;
312 QemuMutex mutex;
313 QemuCond cond;
314 void *des;
315 uint8_t *compbuf;
316 int len;
318 typedef struct DecompressParam DecompressParam;
320 static CompressParam *comp_param;
321 static QemuThread *compress_threads;
322 /* comp_done_cond is used to wake up the migration thread when
323 * one of the compression threads has finished the compression.
324 * comp_done_lock is used to co-work with comp_done_cond.
326 static QemuMutex comp_done_lock;
327 static QemuCond comp_done_cond;
328 /* The empty QEMUFileOps will be used by file in CompressParam */
329 static const QEMUFileOps empty_ops = { };
331 static DecompressParam *decomp_param;
332 static QemuThread *decompress_threads;
333 static QemuMutex decomp_done_lock;
334 static QemuCond decomp_done_cond;
336 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
337 ram_addr_t offset);
339 static void *do_data_compress(void *opaque)
341 CompressParam *param = opaque;
342 RAMBlock *block;
343 ram_addr_t offset;
345 qemu_mutex_lock(&param->mutex);
346 while (!param->quit) {
347 if (param->block) {
348 block = param->block;
349 offset = param->offset;
350 param->block = NULL;
351 qemu_mutex_unlock(&param->mutex);
353 do_compress_ram_page(param->file, block, offset);
355 qemu_mutex_lock(&comp_done_lock);
356 param->done = true;
357 qemu_cond_signal(&comp_done_cond);
358 qemu_mutex_unlock(&comp_done_lock);
360 qemu_mutex_lock(&param->mutex);
361 } else {
362 qemu_cond_wait(&param->cond, &param->mutex);
365 qemu_mutex_unlock(&param->mutex);
367 return NULL;
370 static inline void terminate_compression_threads(void)
372 int idx, thread_count;
374 thread_count = migrate_compress_threads();
376 for (idx = 0; idx < thread_count; idx++) {
377 qemu_mutex_lock(&comp_param[idx].mutex);
378 comp_param[idx].quit = true;
379 qemu_cond_signal(&comp_param[idx].cond);
380 qemu_mutex_unlock(&comp_param[idx].mutex);
384 void migrate_compress_threads_join(void)
386 int i, thread_count;
388 if (!migrate_use_compression()) {
389 return;
391 terminate_compression_threads();
392 thread_count = migrate_compress_threads();
393 for (i = 0; i < thread_count; i++) {
394 qemu_thread_join(compress_threads + i);
395 qemu_fclose(comp_param[i].file);
396 qemu_mutex_destroy(&comp_param[i].mutex);
397 qemu_cond_destroy(&comp_param[i].cond);
399 qemu_mutex_destroy(&comp_done_lock);
400 qemu_cond_destroy(&comp_done_cond);
401 g_free(compress_threads);
402 g_free(comp_param);
403 compress_threads = NULL;
404 comp_param = NULL;
407 void migrate_compress_threads_create(void)
409 int i, thread_count;
411 if (!migrate_use_compression()) {
412 return;
414 thread_count = migrate_compress_threads();
415 compress_threads = g_new0(QemuThread, thread_count);
416 comp_param = g_new0(CompressParam, thread_count);
417 qemu_cond_init(&comp_done_cond);
418 qemu_mutex_init(&comp_done_lock);
419 for (i = 0; i < thread_count; i++) {
420 /* comp_param[i].file is just used as a dummy buffer to save data,
421 * set its ops to empty.
423 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
424 comp_param[i].done = true;
425 comp_param[i].quit = false;
426 qemu_mutex_init(&comp_param[i].mutex);
427 qemu_cond_init(&comp_param[i].cond);
428 qemu_thread_create(compress_threads + i, "compress",
429 do_data_compress, comp_param + i,
430 QEMU_THREAD_JOINABLE);
435 * save_page_header: write page header to wire
437 * If this is the 1st block, it also writes the block identification
439 * Returns the number of bytes written
441 * @f: QEMUFile where to send the data
442 * @block: block that contains the page we want to send
443 * @offset: offset inside the block for the page
444 * in the lower bits, it contains flags
446 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
447 ram_addr_t offset)
449 size_t size, len;
451 if (block == rs->last_sent_block) {
452 offset |= RAM_SAVE_FLAG_CONTINUE;
454 qemu_put_be64(f, offset);
455 size = 8;
457 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
458 len = strlen(block->idstr);
459 qemu_put_byte(f, len);
460 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
461 size += 1 + len;
462 rs->last_sent_block = block;
464 return size;
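/*
* For reference, the header written above is: 8 bytes of be64 offset|flags,
* optionally followed by a one-byte idstr length and the idstr itself when
* the page starts a new RAMBlock (i.e. RAM_SAVE_FLAG_CONTINUE is not set).
* The returned size is therefore either 8 or 8 + 1 + strlen(idstr).
*/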
468 * mig_throttle_guest_down: throttle down the guest
470 * Reduce amount of guest cpu execution to hopefully slow down memory
471 * writes. If guest dirty memory rate is reduced below the rate at
472 * which we can transfer pages to the destination then we should be
473 * able to complete migration. Some workloads dirty memory way too
474 * fast and will not effectively converge, even with auto-converge.
476 static void mig_throttle_guest_down(void)
478 MigrationState *s = migrate_get_current();
479 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
480 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
482 /* We have not started throttling yet. Let's start it. */
483 if (!cpu_throttle_active()) {
484 cpu_throttle_set(pct_initial);
485 } else {
486 /* Throttling already on, just increase the rate */
487 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
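/*
* Sketch of the effect (hypothetical parameter values): with
* cpu_throttle_initial = 20 and cpu_throttle_increment = 10, successive
* calls throttle the vCPUs at 20%, 30%, 40%, ... until the migration
* converges or the throttle hits its upper limit.
*/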
492 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
494 * @rs: current RAM state
495 * @current_addr: address for the zero page
497 * Update the xbzrle cache to reflect a page that's been sent as all 0.
498 * The important thing is that a stale (not-yet-0'd) page be replaced
499 * by the new data.
500 * As a bonus, if the page wasn't in the cache it gets added so that
501 * when a small write is made into the 0'd page it gets XBZRLE sent.
503 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
505 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
506 return;
509 /* We don't care if this fails to allocate a new cache page
510 * as long as it updated an old one */
511 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
512 rs->bitmap_sync_count);
515 #define ENCODING_FLAG_XBZRLE 0x1
518 * save_xbzrle_page: compress and send current page
520 * Returns: 1 means that we wrote the page
521 * 0 means that page is identical to the one already sent
522 * -1 means that xbzrle would be longer than normal
524 * @rs: current RAM state
525 * @current_data: pointer to the address of the page contents
526 * @current_addr: addr of the page
527 * @block: block that contains the page we want to send
528 * @offset: offset inside the block for the page
529 * @last_stage: if we are at the completion stage
531 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
532 ram_addr_t current_addr, RAMBlock *block,
533 ram_addr_t offset, bool last_stage)
535 int encoded_len = 0, bytes_xbzrle;
536 uint8_t *prev_cached_page;
538 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
539 rs->xbzrle_cache_miss++;
540 if (!last_stage) {
541 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
542 rs->bitmap_sync_count) == -1) {
543 return -1;
544 } else {
545 /* update *current_data when the page has been
546 inserted into cache */
547 *current_data = get_cached_data(XBZRLE.cache, current_addr);
550 return -1;
553 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
555 /* save current buffer into memory */
556 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
558 /* XBZRLE encoding (if there is no overflow) */
559 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
560 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
561 TARGET_PAGE_SIZE);
562 if (encoded_len == 0) {
563 trace_save_xbzrle_page_skipping();
564 return 0;
565 } else if (encoded_len == -1) {
566 trace_save_xbzrle_page_overflow();
567 rs->xbzrle_overflows++;
568 /* update data in the cache */
569 if (!last_stage) {
570 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
571 *current_data = prev_cached_page;
573 return -1;
576 /* we need to update the data in the cache, in order to get the same data */
577 if (!last_stage) {
578 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
581 /* Send XBZRLE based compressed page */
582 bytes_xbzrle = save_page_header(rs, rs->f, block,
583 offset | RAM_SAVE_FLAG_XBZRLE);
584 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
585 qemu_put_be16(rs->f, encoded_len);
586 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
587 bytes_xbzrle += encoded_len + 1 + 2;
588 rs->xbzrle_pages++;
589 rs->xbzrle_bytes += bytes_xbzrle;
590 rs->bytes_transferred += bytes_xbzrle;
592 return 1;
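/*
* For reference, an XBZRLE page as sent above consists of: the usual page
* header (offset | RAM_SAVE_FLAG_XBZRLE), one byte of ENCODING_FLAG_XBZRLE,
* a be16 encoded length and the encoded buffer itself -- hence the
* "encoded_len + 1 + 2" added to bytes_xbzrle.
*/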
596 * migration_bitmap_find_dirty: find the next dirty page from start
598 * Called with rcu_read_lock() to protect migration_bitmap
600 * Returns the byte offset within memory region of the start of a dirty page
602 * @rs: current RAM state
603 * @rb: RAMBlock where to search for dirty pages
604 * @start: page where we start the search
606 static inline
607 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
608 unsigned long start)
610 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
611 unsigned long *bitmap = rb->bmap;
612 unsigned long next;
614 if (rs->ram_bulk_stage && start > 0) {
615 next = start + 1;
616 } else {
617 next = find_next_bit(bitmap, size, start);
620 return next;
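/*
* Note the bulk-stage shortcut above: during the first pass over RAM every
* page is assumed dirty, so instead of scanning the bitmap we can simply
* return start + 1.
*/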
623 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
624 RAMBlock *rb,
625 unsigned long page)
627 bool ret;
629 ret = test_and_clear_bit(page, rb->bmap);
631 if (ret) {
632 rs->migration_dirty_pages--;
634 return ret;
637 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
638 ram_addr_t start, ram_addr_t length)
640 rs->migration_dirty_pages +=
641 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
642 &rs->num_dirty_pages_period);
646 * ram_pagesize_summary: calculate all the pagesizes of a VM
648 * Returns a summary bitmap of the page sizes of all RAMBlocks
650 * For VMs with just normal pages this is equivalent to the host page
651 * size. If it's got some huge pages then it's the OR of all the
652 * different page sizes.
654 uint64_t ram_pagesize_summary(void)
656 RAMBlock *block;
657 uint64_t summary = 0;
659 RAMBLOCK_FOREACH(block) {
660 summary |= block->page_size;
663 return summary;
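/*
* Worked example (illustrative sizes): a guest with ordinary 4 KiB pages
* plus one 2 MiB hugepage-backed RAMBlock yields a summary of
* 0x1000 | 0x200000 = 0x201000.
*/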
666 static void migration_bitmap_sync(RAMState *rs)
668 RAMBlock *block;
669 int64_t end_time;
670 uint64_t bytes_xfer_now;
672 rs->bitmap_sync_count++;
674 if (!rs->bytes_xfer_prev) {
675 rs->bytes_xfer_prev = ram_bytes_transferred();
678 if (!rs->time_last_bitmap_sync) {
679 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
682 trace_migration_bitmap_sync_start();
683 memory_global_dirty_log_sync();
685 qemu_mutex_lock(&rs->bitmap_mutex);
686 rcu_read_lock();
687 RAMBLOCK_FOREACH(block) {
688 migration_bitmap_sync_range(rs, block, 0, block->used_length);
690 rcu_read_unlock();
691 qemu_mutex_unlock(&rs->bitmap_mutex);
693 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
695 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
697 /* more than 1 second = 1000 milliseconds */
698 if (end_time > rs->time_last_bitmap_sync + 1000) {
699 if (migrate_auto_converge()) {
700 /* The following detection logic can be refined later. For now:
701 Check to see if the dirtied bytes are 50% more than the approx.
702 amount of bytes that just got transferred since the last time we
703 were in this routine. If that happens twice, start or increase
704 throttling */
705 bytes_xfer_now = ram_bytes_transferred();
707 if (rs->dirty_pages_rate &&
708 (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
709 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
710 (rs->dirty_rate_high_cnt++ >= 2)) {
711 trace_migration_throttle();
712 rs->dirty_rate_high_cnt = 0;
713 mig_throttle_guest_down();
715 rs->bytes_xfer_prev = bytes_xfer_now;
718 if (migrate_use_xbzrle()) {
719 if (rs->iterations_prev != rs->iterations) {
720 rs->xbzrle_cache_miss_rate =
721 (double)(rs->xbzrle_cache_miss -
722 rs->xbzrle_cache_miss_prev) /
723 (rs->iterations - rs->iterations_prev);
725 rs->iterations_prev = rs->iterations;
726 rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
728 rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
729 / (end_time - rs->time_last_bitmap_sync);
730 rs->time_last_bitmap_sync = end_time;
731 rs->num_dirty_pages_period = 0;
733 if (migrate_use_events()) {
734 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
739 * save_zero_page: send the zero page to the stream
741 * Returns the number of pages written.
743 * @rs: current RAM state
744 * @block: block that contains the page we want to send
745 * @offset: offset inside the block for the page
746 * @p: pointer to the page
748 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
749 uint8_t *p)
751 int pages = -1;
753 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
754 rs->zero_pages++;
755 rs->bytes_transferred +=
756 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
757 qemu_put_byte(rs->f, 0);
758 rs->bytes_transferred += 1;
759 pages = 1;
762 return pages;
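/*
* A zero page therefore costs only the page header plus a single zero byte
* on the wire, instead of a full TARGET_PAGE_SIZE of data.
*/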
765 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
767 if (!migrate_release_ram() || !migration_in_postcopy()) {
768 return;
771 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
775 * ram_save_page: send the given page to the stream
777 * Returns the number of pages written.
778 * < 0 - error
779 * >=0 - Number of pages written - this might legally be 0
780 * if xbzrle noticed the page was the same.
782 * @rs: current RAM state
783 * @block: block that contains the page we want to send
784 * @offset: offset inside the block for the page
785 * @last_stage: if we are at the completion stage
787 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
789 int pages = -1;
790 uint64_t bytes_xmit;
791 ram_addr_t current_addr;
792 uint8_t *p;
793 int ret;
794 bool send_async = true;
795 RAMBlock *block = pss->block;
796 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
798 p = block->host + offset;
799 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
801 /* When in doubt, send the page as a normal page */
802 bytes_xmit = 0;
803 ret = ram_control_save_page(rs->f, block->offset,
804 offset, TARGET_PAGE_SIZE, &bytes_xmit);
805 if (bytes_xmit) {
806 rs->bytes_transferred += bytes_xmit;
807 pages = 1;
810 XBZRLE_cache_lock();
812 current_addr = block->offset + offset;
814 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
815 if (ret != RAM_SAVE_CONTROL_DELAYED) {
816 if (bytes_xmit > 0) {
817 rs->norm_pages++;
818 } else if (bytes_xmit == 0) {
819 rs->zero_pages++;
822 } else {
823 pages = save_zero_page(rs, block, offset, p);
824 if (pages > 0) {
825 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
826 * page would be stale
828 xbzrle_cache_zero_page(rs, current_addr);
829 ram_release_pages(block->idstr, offset, pages);
830 } else if (!rs->ram_bulk_stage &&
831 !migration_in_postcopy() && migrate_use_xbzrle()) {
832 pages = save_xbzrle_page(rs, &p, current_addr, block,
833 offset, last_stage);
834 if (!last_stage) {
835 /* Can't send this cached data async, since the cache page
836 * might get updated before it gets to the wire
838 send_async = false;
843 /* XBZRLE overflow or normal page */
844 if (pages == -1) {
845 rs->bytes_transferred += save_page_header(rs, rs->f, block,
846 offset | RAM_SAVE_FLAG_PAGE);
847 if (send_async) {
848 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
849 migrate_release_ram() &
850 migration_in_postcopy());
851 } else {
852 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
854 rs->bytes_transferred += TARGET_PAGE_SIZE;
855 pages = 1;
856 rs->norm_pages++;
859 XBZRLE_cache_unlock();
861 return pages;
864 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
865 ram_addr_t offset)
867 RAMState *rs = &ram_state;
868 int bytes_sent, blen;
869 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
871 bytes_sent = save_page_header(rs, f, block, offset |
872 RAM_SAVE_FLAG_COMPRESS_PAGE);
873 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
874 migrate_compress_level());
875 if (blen < 0) {
876 bytes_sent = 0;
877 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
878 error_report("compressed data failed!");
879 } else {
880 bytes_sent += blen;
881 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
884 return bytes_sent;
887 static void flush_compressed_data(RAMState *rs)
889 int idx, len, thread_count;
891 if (!migrate_use_compression()) {
892 return;
894 thread_count = migrate_compress_threads();
896 qemu_mutex_lock(&comp_done_lock);
897 for (idx = 0; idx < thread_count; idx++) {
898 while (!comp_param[idx].done) {
899 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
902 qemu_mutex_unlock(&comp_done_lock);
904 for (idx = 0; idx < thread_count; idx++) {
905 qemu_mutex_lock(&comp_param[idx].mutex);
906 if (!comp_param[idx].quit) {
907 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
908 rs->bytes_transferred += len;
910 qemu_mutex_unlock(&comp_param[idx].mutex);
914 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
915 ram_addr_t offset)
917 param->block = block;
918 param->offset = offset;
921 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
922 ram_addr_t offset)
924 int idx, thread_count, bytes_xmit = -1, pages = -1;
926 thread_count = migrate_compress_threads();
927 qemu_mutex_lock(&comp_done_lock);
928 while (true) {
929 for (idx = 0; idx < thread_count; idx++) {
930 if (comp_param[idx].done) {
931 comp_param[idx].done = false;
932 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
933 qemu_mutex_lock(&comp_param[idx].mutex);
934 set_compress_params(&comp_param[idx], block, offset);
935 qemu_cond_signal(&comp_param[idx].cond);
936 qemu_mutex_unlock(&comp_param[idx].mutex);
937 pages = 1;
938 rs->norm_pages++;
939 rs->bytes_transferred += bytes_xmit;
940 break;
943 if (pages > 0) {
944 break;
945 } else {
946 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
949 qemu_mutex_unlock(&comp_done_lock);
951 return pages;
955 * ram_save_compressed_page: compress the given page and send it to the stream
957 * Returns the number of pages written.
959 * @rs: current RAM state
960 * @block: block that contains the page we want to send
961 * @offset: offset inside the block for the page
962 * @last_stage: if we are at the completion stage
964 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
965 bool last_stage)
967 int pages = -1;
968 uint64_t bytes_xmit = 0;
969 uint8_t *p;
970 int ret, blen;
971 RAMBlock *block = pss->block;
972 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
974 p = block->host + offset;
976 ret = ram_control_save_page(rs->f, block->offset,
977 offset, TARGET_PAGE_SIZE, &bytes_xmit);
978 if (bytes_xmit) {
979 rs->bytes_transferred += bytes_xmit;
980 pages = 1;
982 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
983 if (ret != RAM_SAVE_CONTROL_DELAYED) {
984 if (bytes_xmit > 0) {
985 rs->norm_pages++;
986 } else if (bytes_xmit == 0) {
987 rs->zero_pages++;
990 } else {
991 /* When starting the process of a new block, the first page of
992 * the block should be sent out before other pages in the same
993 * block, and all the pages in last block should have been sent
994 * out, keeping this order is important, because the 'cont' flag
995 * is used to avoid resending the block name.
997 if (block != rs->last_sent_block) {
998 flush_compressed_data(rs);
999 pages = save_zero_page(rs, block, offset, p);
1000 if (pages == -1) {
1001 /* Make sure the first page is sent out before other pages */
1002 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1003 RAM_SAVE_FLAG_COMPRESS_PAGE);
1004 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1005 migrate_compress_level());
1006 if (blen > 0) {
1007 rs->bytes_transferred += bytes_xmit + blen;
1008 rs->norm_pages++;
1009 pages = 1;
1010 } else {
1011 qemu_file_set_error(rs->f, blen);
1012 error_report("compressed data failed!");
1015 if (pages > 0) {
1016 ram_release_pages(block->idstr, offset, pages);
1018 } else {
1019 pages = save_zero_page(rs, block, offset, p);
1020 if (pages == -1) {
1021 pages = compress_page_with_multi_thread(rs, block, offset);
1022 } else {
1023 ram_release_pages(block->idstr, offset, pages);
1028 return pages;
1032 * find_dirty_block: find the next dirty page and update any state
1033 * associated with the search process.
1035 * Returns whether a page was found
1037 * @rs: current RAM state
1038 * @pss: data about the state of the current dirty page scan
1039 * @again: set to false if the search has scanned the whole of RAM
1041 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1043 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1044 if (pss->complete_round && pss->block == rs->last_seen_block &&
1045 pss->page >= rs->last_page) {
1047 * We've been once around the RAM and haven't found anything.
1048 * Give up.
1050 *again = false;
1051 return false;
1053 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1054 /* Didn't find anything in this RAM Block */
1055 pss->page = 0;
1056 pss->block = QLIST_NEXT_RCU(pss->block, next);
1057 if (!pss->block) {
1058 /* Hit the end of the list */
1059 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1060 /* Flag that we've looped */
1061 pss->complete_round = true;
1062 rs->ram_bulk_stage = false;
1063 if (migrate_use_xbzrle()) {
1064 /* If xbzrle is on, stop using the data compression at this
1065 * point. In theory, xbzrle can do better than compression.
1067 flush_compressed_data(rs);
1070 /* Didn't find anything this time, but try again on the new block */
1071 *again = true;
1072 return false;
1073 } else {
1074 /* Can go around again, but... */
1075 *again = true;
1076 /* We've found something so probably don't need to */
1077 return true;
1082 * unqueue_page: gets a page off the queue
1084 * Helper for 'get_queued_page' - gets a page off the queue
1086 * Returns the block of the page (or NULL if none available)
1088 * @rs: current RAM state
1089 * @offset: used to return the offset within the RAMBlock
1091 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1093 RAMBlock *block = NULL;
1095 qemu_mutex_lock(&rs->src_page_req_mutex);
1096 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1097 struct RAMSrcPageRequest *entry =
1098 QSIMPLEQ_FIRST(&rs->src_page_requests);
1099 block = entry->rb;
1100 *offset = entry->offset;
1102 if (entry->len > TARGET_PAGE_SIZE) {
1103 entry->len -= TARGET_PAGE_SIZE;
1104 entry->offset += TARGET_PAGE_SIZE;
1105 } else {
1106 memory_region_unref(block->mr);
1107 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1108 g_free(entry);
1111 qemu_mutex_unlock(&rs->src_page_req_mutex);
1113 return block;
1117 * get_queued_page: unqueue a page from the postcopy requests
1119 * Skips pages that are already sent (!dirty)
1121 * Returns whether a queued page was found
1123 * @rs: current RAM state
1124 * @pss: data about the state of the current dirty page scan
1126 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1128 RAMBlock *block;
1129 ram_addr_t offset;
1130 bool dirty;
1132 do {
1133 block = unqueue_page(rs, &offset);
1135 * We're sending this page, and since it's postcopy nothing else
1136 * will dirty it, and we must make sure it doesn't get sent again
1137 * even if this queue request was received after the background
1138 * search already sent it.
1140 if (block) {
1141 unsigned long page;
1143 page = offset >> TARGET_PAGE_BITS;
1144 dirty = test_bit(page, block->bmap);
1145 if (!dirty) {
1146 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1147 page, test_bit(page, block->unsentmap));
1148 } else {
1149 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1153 } while (block && !dirty);
1155 if (block) {
1157 * As soon as we start servicing pages out of order, then we have
1158 * to kill the bulk stage, since the bulk stage assumes
1159 * in (migration_bitmap_find_and_reset_dirty) that every page is
1160 * dirty, that's no longer true.
1162 rs->ram_bulk_stage = false;
1165 * We want the background search to continue from the queued page
1166 * since the guest is likely to want other pages near to the page
1167 * it just requested.
1169 pss->block = block;
1170 pss->page = offset >> TARGET_PAGE_BITS;
1173 return !!block;
1177 * migration_page_queue_free: drop any remaining pages in the ram
1178 * request queue
1180 * It should be empty at the end anyway, but in error cases there may
1181 * be some left; if any pages are left, we drop them.
1182 */
1184 void migration_page_queue_free(void)
1186 struct RAMSrcPageRequest *mspr, *next_mspr;
1187 RAMState *rs = &ram_state;
1188 /* This queue generally should be empty - but in the case of a failed
1189 * migration might have some droppings in.
1191 rcu_read_lock();
1192 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1193 memory_region_unref(mspr->rb->mr);
1194 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1195 g_free(mspr);
1197 rcu_read_unlock();
1201 * ram_save_queue_pages: queue the page for transmission
1203 * A request from postcopy destination for example.
1205 * Returns zero on success or negative on error
1207 * @rbname: Name of the RAMBlock of the request. NULL means the
1208 * same as the last one.
1209 * @start: starting address from the start of the RAMBlock
1210 * @len: length (in bytes) to send
1212 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1214 RAMBlock *ramblock;
1215 RAMState *rs = &ram_state;
1217 rs->postcopy_requests++;
1218 rcu_read_lock();
1219 if (!rbname) {
1220 /* Reuse last RAMBlock */
1221 ramblock = rs->last_req_rb;
1223 if (!ramblock) {
1225 * Shouldn't happen, we can't reuse the last RAMBlock if
1226 * it's the 1st request.
1228 error_report("ram_save_queue_pages no previous block");
1229 goto err;
1231 } else {
1232 ramblock = qemu_ram_block_by_name(rbname);
1234 if (!ramblock) {
1235 /* We shouldn't be asked for a non-existent RAMBlock */
1236 error_report("ram_save_queue_pages no block '%s'", rbname);
1237 goto err;
1239 rs->last_req_rb = ramblock;
1241 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1242 if (start+len > ramblock->used_length) {
1243 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1244 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1245 __func__, start, len, ramblock->used_length);
1246 goto err;
1249 struct RAMSrcPageRequest *new_entry =
1250 g_malloc0(sizeof(struct RAMSrcPageRequest));
1251 new_entry->rb = ramblock;
1252 new_entry->offset = start;
1253 new_entry->len = len;
1255 memory_region_ref(ramblock->mr);
1256 qemu_mutex_lock(&rs->src_page_req_mutex);
1257 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1258 qemu_mutex_unlock(&rs->src_page_req_mutex);
1259 rcu_read_unlock();
1261 return 0;
1263 err:
1264 rcu_read_unlock();
1265 return -1;
1269 * ram_save_target_page: save one target page
1271 * Returns the number of pages written
1273 * @rs: current RAM state
1274 * @ms: current migration state
1275 * @pss: data about the page we want to send
1276 * @last_stage: if we are at the completion stage
1278 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1279 bool last_stage)
1281 int res = 0;
1283 /* Check whether the page is dirty and, if so, send it */
1284 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1286 * If xbzrle is on, stop using the data compression after first
1287 * round of migration even if compression is enabled. In theory,
1288 * xbzrle can do better than compression.
1290 if (migrate_use_compression() &&
1291 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1292 res = ram_save_compressed_page(rs, pss, last_stage);
1293 } else {
1294 res = ram_save_page(rs, pss, last_stage);
1297 if (res < 0) {
1298 return res;
1300 if (pss->block->unsentmap) {
1301 clear_bit(pss->page, pss->block->unsentmap);
1305 return res;
1309 * ram_save_host_page: save a whole host page
1311 * Starting at *offset send pages up to the end of the current host
1312 * page. It's valid for the initial offset to point into the middle of
1313 * a host page in which case the remainder of the hostpage is sent.
1314 * Only dirty target pages are sent. Note that the host page size may
1315 * be a huge page for this block.
1316 * The saving stops at the boundary of the used_length of the block
1317 * if the RAMBlock isn't a multiple of the host page size.
1319 * Returns the number of pages written or negative on error
1321 * @rs: current RAM state
1322 * @ms: current migration state
1323 * @pss: data about the page we want to send
1324 * @last_stage: if we are at the completion stage
1326 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1327 bool last_stage)
1329 int tmppages, pages = 0;
1330 size_t pagesize_bits =
1331 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1333 do {
1334 tmppages = ram_save_target_page(rs, pss, last_stage);
1335 if (tmppages < 0) {
1336 return tmppages;
1339 pages += tmppages;
1340 pss->page++;
1341 } while ((pss->page & (pagesize_bits - 1)) &&
1342 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1344 /* The offset we leave with is the last one we looked at */
1345 pss->page--;
1346 return pages;
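/*
* Rough example: for a RAMBlock backed by 2 MiB huge pages with a 4 KiB
* target page size, pagesize_bits is 512, so a single call can send up to
* 512 target pages (the dirty ones within that host page).
*/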
1350 * ram_find_and_save_block: finds a dirty page and sends it to f
1352 * Called within an RCU critical section.
1354 * Returns the number of pages written where zero means no dirty pages
1356 * @rs: current RAM state
1357 * @last_stage: if we are at the completion stage
1359 * On systems where host-page-size > target-page-size it will send all the
1360 * pages in a host page that are dirty.
1363 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1365 PageSearchStatus pss;
1366 int pages = 0;
1367 bool again, found;
1369 /* No dirty page as there is zero RAM */
1370 if (!ram_bytes_total()) {
1371 return pages;
1374 pss.block = rs->last_seen_block;
1375 pss.page = rs->last_page;
1376 pss.complete_round = false;
1378 if (!pss.block) {
1379 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1382 do {
1383 again = true;
1384 found = get_queued_page(rs, &pss);
1386 if (!found) {
1387 /* priority queue empty, so just search for something dirty */
1388 found = find_dirty_block(rs, &pss, &again);
1391 if (found) {
1392 pages = ram_save_host_page(rs, &pss, last_stage);
1394 } while (!pages && again);
1396 rs->last_seen_block = pss.block;
1397 rs->last_page = pss.page;
1399 return pages;
1402 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1404 uint64_t pages = size / TARGET_PAGE_SIZE;
1405 RAMState *rs = &ram_state;
1407 if (zero) {
1408 rs->zero_pages += pages;
1409 } else {
1410 rs->norm_pages += pages;
1411 rs->bytes_transferred += size;
1412 qemu_update_position(f, size);
1416 uint64_t ram_bytes_total(void)
1418 RAMBlock *block;
1419 uint64_t total = 0;
1421 rcu_read_lock();
1422 RAMBLOCK_FOREACH(block) {
1423 total += block->used_length;
1425 rcu_read_unlock();
1426 return total;
1429 void free_xbzrle_decoded_buf(void)
1431 g_free(xbzrle_decoded_buf);
1432 xbzrle_decoded_buf = NULL;
1435 static void ram_migration_cleanup(void *opaque)
1437 RAMBlock *block;
1439 /* The caller holds the iothread lock or is in a bottom half, so there is
1440 * no write race against this migration bitmap
1441 */
1442 memory_global_dirty_log_stop();
1444 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1445 g_free(block->bmap);
1446 block->bmap = NULL;
1447 g_free(block->unsentmap);
1448 block->unsentmap = NULL;
1451 XBZRLE_cache_lock();
1452 if (XBZRLE.cache) {
1453 cache_fini(XBZRLE.cache);
1454 g_free(XBZRLE.encoded_buf);
1455 g_free(XBZRLE.current_buf);
1456 g_free(ZERO_TARGET_PAGE);
1457 XBZRLE.cache = NULL;
1458 XBZRLE.encoded_buf = NULL;
1459 XBZRLE.current_buf = NULL;
1461 XBZRLE_cache_unlock();
1464 static void ram_state_reset(RAMState *rs)
1466 rs->last_seen_block = NULL;
1467 rs->last_sent_block = NULL;
1468 rs->last_page = 0;
1469 rs->last_version = ram_list.version;
1470 rs->ram_bulk_stage = true;
1473 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1476 * 'expected' is the value you expect the bitmap mostly to be full
1477 * of; it won't bother printing lines that are all this value.
1478 * If 'todump' is null the migration bitmap is dumped.
1480 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1481 unsigned long pages)
1483 int64_t cur;
1484 int64_t linelen = 128;
1485 char linebuf[129];
1487 for (cur = 0; cur < pages; cur += linelen) {
1488 int64_t curb;
1489 bool found = false;
1491 * Last line; catch the case where the line length
1492 * is longer than remaining ram
1494 if (cur + linelen > pages) {
1495 linelen = pages - cur;
1497 for (curb = 0; curb < linelen; curb++) {
1498 bool thisbit = test_bit(cur + curb, todump);
1499 linebuf[curb] = thisbit ? '1' : '.';
1500 found = found || (thisbit != expected);
1502 if (found) {
1503 linebuf[curb] = '\0';
1504 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1509 /* **** functions for postcopy ***** */
1511 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1513 struct RAMBlock *block;
1515 RAMBLOCK_FOREACH(block) {
1516 unsigned long *bitmap = block->bmap;
1517 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1518 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1520 while (run_start < range) {
1521 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1522 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1523 (run_end - run_start) << TARGET_PAGE_BITS);
1524 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1530 * postcopy_send_discard_bm_ram: discard a RAMBlock
1532 * Returns zero on success
1534 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1535 * Note: At this point the 'unsentmap' is the processed bitmap combined
1536 * with the dirtymap; so a '1' means it's either dirty or unsent.
1538 * @ms: current migration state
1539 * @pds: state for postcopy
1540 * @block: RAMBlock whose unsent/dirty pages are discarded
1543 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1544 PostcopyDiscardState *pds,
1545 RAMBlock *block)
1547 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1548 unsigned long current;
1549 unsigned long *unsentmap = block->unsentmap;
1551 for (current = 0; current < end; ) {
1552 unsigned long one = find_next_bit(unsentmap, end, current);
1554 if (one <= end) {
1555 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1556 unsigned long discard_length;
1558 if (zero >= end) {
1559 discard_length = end - one;
1560 } else {
1561 discard_length = zero - one;
1563 if (discard_length) {
1564 postcopy_discard_send_range(ms, pds, one, discard_length);
1566 current = one + discard_length;
1567 } else {
1568 current = one;
1572 return 0;
1576 * postcopy_each_ram_send_discard: discard all RAMBlocks
1578 * Returns 0 for success or negative for error
1580 * Utility for the outgoing postcopy code.
1581 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1582 * passing it bitmap indexes and name.
1583 * (qemu_ram_foreach_block ends up passing unscaled lengths
1584 * which would mean postcopy code would have to deal with target page)
1586 * @ms: current migration state
1588 static int postcopy_each_ram_send_discard(MigrationState *ms)
1590 struct RAMBlock *block;
1591 int ret;
1593 RAMBLOCK_FOREACH(block) {
1594 PostcopyDiscardState *pds =
1595 postcopy_discard_send_init(ms, block->idstr);
1598 * Postcopy sends chunks of bitmap over the wire, but it
1599 * just needs indexes at this point, avoids it having
1600 * target page specific code.
1602 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1603 postcopy_discard_send_finish(ms, pds);
1604 if (ret) {
1605 return ret;
1609 return 0;
1613 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1615 * Helper for postcopy_chunk_hostpages; it's called twice to
1616 * canonicalize the two bitmaps, which are similar but one is
1617 * inverted.
1619 * Postcopy requires that all target pages in a hostpage are dirty or
1620 * clean, not a mix. This function canonicalizes the bitmaps.
1622 * @ms: current migration state
1623 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1624 * otherwise we need to canonicalize partially dirty host pages
1625 * @block: block that contains the page we want to canonicalize
1626 * @pds: state for postcopy
1628 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1629 RAMBlock *block,
1630 PostcopyDiscardState *pds)
1632 RAMState *rs = &ram_state;
1633 unsigned long *bitmap = block->bmap;
1634 unsigned long *unsentmap = block->unsentmap;
1635 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1636 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1637 unsigned long run_start;
1639 if (block->page_size == TARGET_PAGE_SIZE) {
1640 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1641 return;
1644 if (unsent_pass) {
1645 /* Find a sent page */
1646 run_start = find_next_zero_bit(unsentmap, pages, 0);
1647 } else {
1648 /* Find a dirty page */
1649 run_start = find_next_bit(bitmap, pages, 0);
1652 while (run_start < pages) {
1653 bool do_fixup = false;
1654 unsigned long fixup_start_addr;
1655 unsigned long host_offset;
1658 * If the start of this run of pages is in the middle of a host
1659 * page, then we need to fixup this host page.
1661 host_offset = run_start % host_ratio;
1662 if (host_offset) {
1663 do_fixup = true;
1664 run_start -= host_offset;
1665 fixup_start_addr = run_start;
1666 /* For the next pass */
1667 run_start = run_start + host_ratio;
1668 } else {
1669 /* Find the end of this run */
1670 unsigned long run_end;
1671 if (unsent_pass) {
1672 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1673 } else {
1674 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1677 * If the end isn't at the start of a host page, then the
1678 * run doesn't finish at the end of a host page
1679 * and we need to discard.
1681 host_offset = run_end % host_ratio;
1682 if (host_offset) {
1683 do_fixup = true;
1684 fixup_start_addr = run_end - host_offset;
1686 * This host page has gone, the next loop iteration starts
1687 * from after the fixup
1689 run_start = fixup_start_addr + host_ratio;
1690 } else {
1692 * No discards on this iteration, next loop starts from
1693 * next sent/dirty page
1695 run_start = run_end + 1;
1699 if (do_fixup) {
1700 unsigned long page;
1702 /* Tell the destination to discard this page */
1703 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1704 /* For the unsent_pass we:
1705 * discard partially sent pages
1706 * For the !unsent_pass (dirty) we:
1707 * discard partially dirty pages that were sent
1708 * (any partially sent pages were already discarded
1709 * by the previous unsent_pass)
1711 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1712 host_ratio);
1715 /* Clean up the bitmap */
1716 for (page = fixup_start_addr;
1717 page < fixup_start_addr + host_ratio; page++) {
1718 /* All pages in this host page are now not sent */
1719 set_bit(page, unsentmap);
1722 * Remark them as dirty, updating the count for any pages
1723 * that weren't previously dirty.
1725 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1729 if (unsent_pass) {
1730 /* Find the next sent page for the next iteration */
1731 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1732 } else {
1733 /* Find the next dirty page for the next iteration */
1734 run_start = find_next_bit(bitmap, pages, run_start);
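/*
* Illustration (assuming 2 MiB host pages and 4 KiB target pages, so
* host_ratio == 512): if a run of dirty pages starts in the middle of a
* host page, the whole 512-page host page is discarded on the destination
* and re-marked dirty/unsent here, so postcopy never has to deal with a
* partially populated huge page.
*/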
1740 * postcopy_chunk_hostpages: discard any partially sent host page
1742 * Utility for the outgoing postcopy code.
1744 * Discard any partially sent host-page size chunks, mark any partially
1745 * dirty host-page size chunks as all dirty. In this case the host-page
1746 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1748 * Returns zero on success
1750 * @ms: current migration state
1751 * @block: block we want to work with
1753 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1755 PostcopyDiscardState *pds =
1756 postcopy_discard_send_init(ms, block->idstr);
1758 /* First pass: Discard all partially sent host pages */
1759 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1761 * Second pass: Ensure that all partially dirty host pages are made
1762 * fully dirty.
1764 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1766 postcopy_discard_send_finish(ms, pds);
1767 return 0;
1771 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1773 * Returns zero on success
1775 * Transmit the set of pages to be discarded after precopy to the target;
1776 * these are pages that:
1777 * a) Have been previously transmitted but are now dirty again
1778 * b) Pages that have never been transmitted, this ensures that
1779 * any pages on the destination that have been mapped by background
1780 * tasks get discarded (transparent huge pages is the specific concern)
1781 * Hopefully this is pretty sparse
1783 * @ms: current migration state
1785 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1787 RAMState *rs = &ram_state;
1788 RAMBlock *block;
1789 int ret;
1791 rcu_read_lock();
1793 /* This should be our last sync, the src is now paused */
1794 migration_bitmap_sync(rs);
1796 /* Easiest way to make sure we don't resume in the middle of a host-page */
1797 rs->last_seen_block = NULL;
1798 rs->last_sent_block = NULL;
1799 rs->last_page = 0;
1801 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1802 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1803 unsigned long *bitmap = block->bmap;
1804 unsigned long *unsentmap = block->unsentmap;
1806 if (!unsentmap) {
1807 /* We don't have a safe way to resize the sentmap, so
1808 * if the bitmap was resized it will be NULL at this
1809 * point.
1811 error_report("migration ram resized during precopy phase");
1812 rcu_read_unlock();
1813 return -EINVAL;
1815 /* Deal with TPS != HPS and huge pages */
1816 ret = postcopy_chunk_hostpages(ms, block);
1817 if (ret) {
1818 rcu_read_unlock();
1819 return ret;
1823 * Update the unsentmap to be unsentmap = unsentmap | dirty
1825 bitmap_or(unsentmap, unsentmap, bitmap, pages);
1826 #ifdef DEBUG_POSTCOPY
1827 ram_debug_dump_bitmap(unsentmap, true, pages);
1828 #endif
1830 trace_ram_postcopy_send_discard_bitmap();
1832 ret = postcopy_each_ram_send_discard(ms);
1833 rcu_read_unlock();
1835 return ret;
1839 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1841 * Returns zero on success
1843 * @rbname: name of the RAMBlock of the request. NULL means the
1844 * same as the last one.
1845 * @start: RAMBlock starting page
1846 * @length: RAMBlock size
1848 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1850 int ret = -1;
1852 trace_ram_discard_range(rbname, start, length);
1854 rcu_read_lock();
1855 RAMBlock *rb = qemu_ram_block_by_name(rbname);
1857 if (!rb) {
1858 error_report("ram_discard_range: Failed to find block '%s'", rbname);
1859 goto err;
1862 ret = ram_block_discard_range(rb, start, length);
1864 err:
1865 rcu_read_unlock();
1867 return ret;
1870 static int ram_state_init(RAMState *rs)
1872 memset(rs, 0, sizeof(*rs));
1873 qemu_mutex_init(&rs->bitmap_mutex);
1874 qemu_mutex_init(&rs->src_page_req_mutex);
1875 QSIMPLEQ_INIT(&rs->src_page_requests);
1877 if (migrate_use_xbzrle()) {
1878 XBZRLE_cache_lock();
1879 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1880 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1881 TARGET_PAGE_SIZE,
1882 TARGET_PAGE_SIZE);
1883 if (!XBZRLE.cache) {
1884 XBZRLE_cache_unlock();
1885 error_report("Error creating cache");
1886 return -1;
1888 XBZRLE_cache_unlock();
1890 /* We prefer not to abort if there is no memory */
1891 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1892 if (!XBZRLE.encoded_buf) {
1893 error_report("Error allocating encoded_buf");
1894 return -1;
1897 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1898 if (!XBZRLE.current_buf) {
1899 error_report("Error allocating current_buf");
1900 g_free(XBZRLE.encoded_buf);
1901 XBZRLE.encoded_buf = NULL;
1902 return -1;
1906 /* For memory_global_dirty_log_start below. */
1907 qemu_mutex_lock_iothread();
1909 qemu_mutex_lock_ramlist();
1910 rcu_read_lock();
1911 ram_state_reset(rs);
1913 /* Skip setting bitmap if there is no RAM */
1914 if (ram_bytes_total()) {
1915 RAMBlock *block;
1917 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1918 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1920 block->bmap = bitmap_new(pages);
1921 bitmap_set(block->bmap, 0, pages);
1922 if (migrate_postcopy_ram()) {
1923 block->unsentmap = bitmap_new(pages);
1924 bitmap_set(block->unsentmap, 0, pages);
1930 * Count the total number of pages used by ram blocks not including any
1931 * gaps due to alignment or unplugs.
1933 rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1935 memory_global_dirty_log_start();
1936 migration_bitmap_sync(rs);
1937 qemu_mutex_unlock_ramlist();
1938 qemu_mutex_unlock_iothread();
1939 rcu_read_unlock();
1941 return 0;
1945 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1946 * a long-running RCU critical section. When rcu-reclaims in the code
1947 * start to become numerous it will be necessary to reduce the
1948 * granularity of these critical sections.
1952 * ram_save_setup: Setup RAM for migration
1954 * Returns zero to indicate success and negative for error
1956 * @f: QEMUFile where to send the data
1957 * @opaque: RAMState pointer
1959 static int ram_save_setup(QEMUFile *f, void *opaque)
1961 RAMState *rs = opaque;
1962 RAMBlock *block;
1964 /* migration has already set up the bitmap, reuse it. */
1965 if (!migration_in_colo_state()) {
1966 if (ram_state_init(rs) < 0) {
1967 return -1;
1970 rs->f = f;
1972 rcu_read_lock();
1974 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1976 RAMBLOCK_FOREACH(block) {
1977 qemu_put_byte(f, strlen(block->idstr));
1978 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1979 qemu_put_be64(f, block->used_length);
1980 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1981 qemu_put_be64(f, block->page_size);
1985 rcu_read_unlock();
1987 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1988 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1990 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1992 return 0;
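/*
* The setup section emitted above is, in order: a be64 of ram_bytes_total()
* with RAM_SAVE_FLAG_MEM_SIZE set, then for every RAMBlock a one-byte idstr
* length, the idstr and a be64 used_length (plus a be64 page size when
* postcopy is in use and the block's page size differs from the host page
* size), terminated by RAM_SAVE_FLAG_EOS.
*/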
1996 * ram_save_iterate: iterative stage for migration
1998 * Returns zero to indicate success and negative for error
2000 * @f: QEMUFile where to send the data
2001 * @opaque: RAMState pointer
2003 static int ram_save_iterate(QEMUFile *f, void *opaque)
2005 RAMState *rs = opaque;
2006 int ret;
2007 int i;
2008 int64_t t0;
2009 int done = 0;
2011 rcu_read_lock();
2012 if (ram_list.version != rs->last_version) {
2013 ram_state_reset(rs);
2016 /* Read version before ram_list.blocks */
2017 smp_rmb();
2019 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2021 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2022 i = 0;
2023 while ((ret = qemu_file_rate_limit(f)) == 0) {
2024 int pages;
2026 pages = ram_find_and_save_block(rs, false);
2027 /* no more pages to send */
2028 if (pages == 0) {
2029 done = 1;
2030 break;
2032 rs->iterations++;
2034 /* we want to check in the 1st loop, just in case it was the 1st time
2035 and we had to sync the dirty bitmap.
2036 qemu_get_clock_ns() is a bit expensive, so we only check once every
2037 few iterations
2038 */
2039 if ((i & 63) == 0) {
2040 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2041 if (t1 > MAX_WAIT) {
2042 trace_ram_save_iterate_big_wait(t1, i);
2043 break;
2046 i++;
2048 flush_compressed_data(rs);
2049 rcu_read_unlock();
2052 * Must occur before EOS (or any QEMUFile operation)
2053 * because of RDMA protocol.
2055 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2057 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2058 rs->bytes_transferred += 8;
2060 ret = qemu_file_get_error(f);
2061 if (ret < 0) {
2062 return ret;
2065 return done;
2069 * ram_save_complete: function called to send the remaining amount of RAM
2071 * Returns zero to indicate success
2073 * Called with the iothread lock held
2075 * @f: QEMUFile where to send the data
2076 * @opaque: RAMState pointer
2078 static int ram_save_complete(QEMUFile *f, void *opaque)
2080 RAMState *rs = opaque;
2082 rcu_read_lock();
2084 if (!migration_in_postcopy()) {
2085 migration_bitmap_sync(rs);
2088 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2090 /* try transferring iterative blocks of memory */
2092 /* flush all remaining blocks regardless of rate limiting */
2093 while (true) {
2094 int pages;
2096 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2098 /* no more blocks to send */
2098 if (pages == 0) {
2099 break;
2103 flush_compressed_data(rs);
2104 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2106 rcu_read_unlock();
2108 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2110 return 0;
2113 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2114 uint64_t *non_postcopiable_pending,
2115 uint64_t *postcopiable_pending)
2117 RAMState *rs = opaque;
2118 uint64_t remaining_size;
2120 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2122 if (!migration_in_postcopy() &&
2123 remaining_size < max_size) {
2124 qemu_mutex_lock_iothread();
2125 rcu_read_lock();
2126 migration_bitmap_sync(rs);
2127 rcu_read_unlock();
2128 qemu_mutex_unlock_iothread();
2129 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2132 /* We can do postcopy, and all the data is postcopiable */
2133 *postcopiable_pending += remaining_size;
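/*
 * Worked example with illustrative numbers only: with a 4 KiB target page
 * and 100000 dirty pages, remaining_size is roughly 400 MB. Only when the
 * estimate is already below max_size (and we are not in postcopy) is the
 * bitmap re-synced under the iothread lock, so the figure handed back in
 * *postcopiable_pending is fresh for the migration core's completion
 * decision.
 */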
2136 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2138 unsigned int xh_len;
2139 int xh_flags;
2140 uint8_t *loaded_data;
2142 if (!xbzrle_decoded_buf) {
2143 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2145 loaded_data = xbzrle_decoded_buf;
2147 /* extract XBZRLE header */
2148 xh_flags = qemu_get_byte(f);
2149 xh_len = qemu_get_be16(f);
2151 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2152 error_report("Failed to load XBZRLE page - wrong compression!");
2153 return -1;
2156 if (xh_len > TARGET_PAGE_SIZE) {
2157 error_report("Failed to load XBZRLE page - len overflow!");
2158 return -1;
2160 /* load data and decode */
2161 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2163 /* decode XBZRLE */
2164 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2165 TARGET_PAGE_SIZE) == -1) {
2166 error_report("Failed to load XBZRLE page - decode error!");
2167 return -1;
2170 return 0;
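/*
 * For reference, a sketch of what load_xbzrle() consumes from the stream
 * for one page (the qemu_get_* calls above are authoritative):
 *
 *   u8    xh_flags   -- must equal ENCODING_FLAG_XBZRLE
 *   be16  xh_len     -- encoded length, at most TARGET_PAGE_SIZE
 *   bytes xh_len bytes of XBZRLE data, decoded against the current
 *         contents of 'host'
 */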
2174 * ram_block_from_stream: read a RAMBlock id from the migration stream
2176 * Must be called from within a rcu critical section.
2178 * Returns a pointer from within the RCU-protected ram_list.
2180 * @f: QEMUFile where to read the data from
2181 * @flags: Page flags (mostly to see if it's a continuation of the previous block)
2183 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2185 static RAMBlock *block = NULL;
2186 char id[256];
2187 uint8_t len;
2189 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2190 if (!block) {
2191 error_report("Ack, bad migration stream!");
2192 return NULL;
2194 return block;
2197 len = qemu_get_byte(f);
2198 qemu_get_buffer(f, (uint8_t *)id, len);
2199 id[len] = 0;
2201 block = qemu_ram_block_by_name(id);
2202 if (!block) {
2203 error_report("Can't find block %s", id);
2204 return NULL;
2207 return block;
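/*
 * Descriptive note: RAM_SAVE_FLAG_CONTINUE means "same RAMBlock as the
 * previous page", so the (length, idstr) pair is only present in the
 * stream when that flag is absent; the cached 'block' pointer carries
 * over between calls, which is why a CONTINUE flag on the very first
 * page is reported as a bad migration stream.
 */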
2210 static inline void *host_from_ram_block_offset(RAMBlock *block,
2211 ram_addr_t offset)
2213 if (!offset_in_ramblock(block, offset)) {
2214 return NULL;
2217 return block->host + offset;
2221 * ram_handle_compressed: handle the zero page case
2223 * If a page (or a whole RDMA chunk) has been
2224 * determined to be zero, then zap it.
2226 * @host: host address for the zero page
2227 * @ch: what the page is filled from. We only support zero
2228 * @size: size of the zero page
2230 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2232 if (ch != 0 || !is_zero_range(host, size)) {
2233 memset(host, ch, size);
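/*
 * Descriptive note: the is_zero_range() test means a destination page
 * that is already zero is left untouched, so the memset() does not
 * needlessly dirty (and potentially allocate) memory on the receiving
 * side.
 */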
2237 static void *do_data_decompress(void *opaque)
2239 DecompressParam *param = opaque;
2240 unsigned long pagesize;
2241 uint8_t *des;
2242 int len;
2244 qemu_mutex_lock(&param->mutex);
2245 while (!param->quit) {
2246 if (param->des) {
2247 des = param->des;
2248 len = param->len;
2249 param->des = 0;
2250 qemu_mutex_unlock(&param->mutex);
2252 pagesize = TARGET_PAGE_SIZE;
2253 /* uncompress() can fail in some cases, especially
2254 * when the page is dirtied while it is being compressed. That is
2255 * not a problem, because the dirty page will be retransferred
2256 * and uncompress() won't break the data in other pages.
2258 uncompress((Bytef *)des, &pagesize,
2259 (const Bytef *)param->compbuf, len);
2261 qemu_mutex_lock(&decomp_done_lock);
2262 param->done = true;
2263 qemu_cond_signal(&decomp_done_cond);
2264 qemu_mutex_unlock(&decomp_done_lock);
2266 qemu_mutex_lock(&param->mutex);
2267 } else {
2268 qemu_cond_wait(&param->cond, &param->mutex);
2271 qemu_mutex_unlock(&param->mutex);
2273 return NULL;
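/*
 * Rough picture of the handshake above (a sketch, not a specification):
 * the feeder thread fills param->des/param->len under param->mutex and
 * signals param->cond; this worker clears param->des, runs uncompress()
 * outside the lock, then sets param->done under decomp_done_lock and
 * signals decomp_done_cond so another page can be queued to it.
 */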
2276 static void wait_for_decompress_done(void)
2278 int idx, thread_count;
2280 if (!migrate_use_compression()) {
2281 return;
2284 thread_count = migrate_decompress_threads();
2285 qemu_mutex_lock(&decomp_done_lock);
2286 for (idx = 0; idx < thread_count; idx++) {
2287 while (!decomp_param[idx].done) {
2288 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2291 qemu_mutex_unlock(&decomp_done_lock);
2294 void migrate_decompress_threads_create(void)
2296 int i, thread_count;
2298 thread_count = migrate_decompress_threads();
2299 decompress_threads = g_new0(QemuThread, thread_count);
2300 decomp_param = g_new0(DecompressParam, thread_count);
2301 qemu_mutex_init(&decomp_done_lock);
2302 qemu_cond_init(&decomp_done_cond);
2303 for (i = 0; i < thread_count; i++) {
2304 qemu_mutex_init(&decomp_param[i].mutex);
2305 qemu_cond_init(&decomp_param[i].cond);
2306 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2307 decomp_param[i].done = true;
2308 decomp_param[i].quit = false;
2309 qemu_thread_create(decompress_threads + i, "decompress",
2310 do_data_decompress, decomp_param + i,
2311 QEMU_THREAD_JOINABLE);
2315 void migrate_decompress_threads_join(void)
2317 int i, thread_count;
2319 thread_count = migrate_decompress_threads();
2320 for (i = 0; i < thread_count; i++) {
2321 qemu_mutex_lock(&decomp_param[i].mutex);
2322 decomp_param[i].quit = true;
2323 qemu_cond_signal(&decomp_param[i].cond);
2324 qemu_mutex_unlock(&decomp_param[i].mutex);
2326 for (i = 0; i < thread_count; i++) {
2327 qemu_thread_join(decompress_threads + i);
2328 qemu_mutex_destroy(&decomp_param[i].mutex);
2329 qemu_cond_destroy(&decomp_param[i].cond);
2330 g_free(decomp_param[i].compbuf);
2332 g_free(decompress_threads);
2333 g_free(decomp_param);
2334 decompress_threads = NULL;
2335 decomp_param = NULL;
2338 static void decompress_data_with_multi_threads(QEMUFile *f,
2339 void *host, int len)
2341 int idx, thread_count;
2343 thread_count = migrate_decompress_threads();
2344 qemu_mutex_lock(&decomp_done_lock);
2345 while (true) {
2346 for (idx = 0; idx < thread_count; idx++) {
2347 if (decomp_param[idx].done) {
2348 decomp_param[idx].done = false;
2349 qemu_mutex_lock(&decomp_param[idx].mutex);
2350 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2351 decomp_param[idx].des = host;
2352 decomp_param[idx].len = len;
2353 qemu_cond_signal(&decomp_param[idx].cond);
2354 qemu_mutex_unlock(&decomp_param[idx].mutex);
2355 break;
2358 if (idx < thread_count) {
2359 break;
2360 } else {
2361 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2364 qemu_mutex_unlock(&decomp_done_lock);
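/*
 * Descriptive summary: the dispatcher above looks for an idle worker
 * (done == true), hands it the compressed buffer and destination address,
 * and if all workers are busy it waits on decomp_done_cond until one of
 * them finishes; the page data itself is written by the worker thread,
 * not here.
 */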
2368 * ram_postcopy_incoming_init: allocate postcopy data structures
2370 * Returns 0 for success and negative on error
2372 * @mis: current migration incoming state
2374 * Allocate the data structures etc needed by incoming migration with
2375 * postcopy-ram. postcopy-ram's similarly named
2376 * postcopy_ram_incoming_init does the work.
2378 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2380 unsigned long ram_pages = last_ram_page();
2382 return postcopy_ram_incoming_init(mis, ram_pages);
2386 * ram_load_postcopy: load a page in postcopy case
2388 * Returns 0 for success or -errno in case of error
2390 * Called in postcopy mode by ram_load().
2391 * rcu_read_lock is taken prior to this being called.
2393 * @f: QEMUFile to read the data from
2395 static int ram_load_postcopy(QEMUFile *f)
2397 int flags = 0, ret = 0;
2398 bool place_needed = false;
2399 bool matching_page_sizes = false;
2400 MigrationIncomingState *mis = migration_incoming_get_current();
2401 /* Temporary page that is later 'placed' */
2402 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2403 void *last_host = NULL;
2404 bool all_zero = false;
2406 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2407 ram_addr_t addr;
2408 void *host = NULL;
2409 void *page_buffer = NULL;
2410 void *place_source = NULL;
2411 RAMBlock *block = NULL;
2412 uint8_t ch;
2414 addr = qemu_get_be64(f);
2415 flags = addr & ~TARGET_PAGE_MASK;
2416 addr &= TARGET_PAGE_MASK;
2418 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2419 place_needed = false;
2420 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2421 block = ram_block_from_stream(f, flags);
2423 host = host_from_ram_block_offset(block, addr);
2424 if (!host) {
2425 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2426 ret = -EINVAL;
2427 break;
2429 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2431 * Postcopy requires that we place whole host pages atomically;
2432 * these may be huge pages for RAMBlocks that are backed by
2433 * hugetlbfs.
2434 * To make it atomic, the data is read into a temporary page
2435 * that's moved into place later.
2436 * The migration protocol uses, possibly smaller, target pages;
2437 * however, the source ensures it always sends all the components
2438 * of a host page in order.
2440 page_buffer = postcopy_host_page +
2441 ((uintptr_t)host & (block->page_size - 1));
2442 /* If all target pages are zero then we can optimise the placement */
2443 if (!((uintptr_t)host & (block->page_size - 1))) {
2444 all_zero = true;
2445 } else {
2446 /* not the 1st target page within the host page */
2447 if (host != (last_host + TARGET_PAGE_SIZE)) {
2448 error_report("Non-sequential target page %p/%p",
2449 host, last_host);
2450 ret = -EINVAL;
2451 break;
2457 * If it's the last part of a host page then we place the host
2458 * page
2460 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2461 (block->page_size - 1)) == 0;
2462 place_source = postcopy_host_page;
2464 last_host = host;
2466 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2467 case RAM_SAVE_FLAG_ZERO:
2468 ch = qemu_get_byte(f);
2469 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2470 if (ch) {
2471 all_zero = false;
2473 break;
2475 case RAM_SAVE_FLAG_PAGE:
2476 all_zero = false;
2477 if (!place_needed || !matching_page_sizes) {
2478 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2479 } else {
2480 /* Avoids an extra copy out of the qemu_file buffer: postcopy is
2481 * going to copy the page into place later anyway, so we can
2482 * read it in place when the read is done in one go (matching page sizes)
2484 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2485 TARGET_PAGE_SIZE);
2487 break;
2488 case RAM_SAVE_FLAG_EOS:
2489 /* normal exit */
2490 break;
2491 default:
2492 error_report("Unknown combination of migration flags: %#x"
2493 " (postcopy mode)", flags);
2494 ret = -EINVAL;
2497 if (place_needed) {
2498 /* This gets called at the last target page in the host page */
2499 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2501 if (all_zero) {
2502 ret = postcopy_place_page_zero(mis, place_dest,
2503 block->page_size);
2504 } else {
2505 ret = postcopy_place_page(mis, place_dest,
2506 place_source, block->page_size);
2509 if (!ret) {
2510 ret = qemu_file_get_error(f);
2514 return ret;
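/*
 * Worked example with illustrative sizes: with 4 KiB target pages on a
 * RAMBlock backed by 2 MiB huge pages, 512 consecutive target pages are
 * accumulated in postcopy_host_page; only when the last of them has been
 * read is place_needed set and the whole huge page placed atomically,
 * either via the zero-page path or from the temporary buffer.
 */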
2517 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2519 int flags = 0, ret = 0;
2520 static uint64_t seq_iter;
2521 int len = 0;
2523 * If the system is running in postcopy mode, page inserts into host memory
2524 * must be atomic
2526 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2527 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
2528 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2530 seq_iter++;
2532 if (version_id != 4) {
2533 ret = -EINVAL;
2536 /* This RCU critical section can be very long running.
2537 * When RCU reclaims in the code start to become numerous,
2538 * it will be necessary to reduce the granularity of this
2539 * critical section.
2541 rcu_read_lock();
2543 if (postcopy_running) {
2544 ret = ram_load_postcopy(f);
2547 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2548 ram_addr_t addr, total_ram_bytes;
2549 void *host = NULL;
2550 uint8_t ch;
2552 addr = qemu_get_be64(f);
2553 flags = addr & ~TARGET_PAGE_MASK;
2554 addr &= TARGET_PAGE_MASK;
2556 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2557 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2558 RAMBlock *block = ram_block_from_stream(f, flags);
2560 host = host_from_ram_block_offset(block, addr);
2561 if (!host) {
2562 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2563 ret = -EINVAL;
2564 break;
2566 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2569 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2570 case RAM_SAVE_FLAG_MEM_SIZE:
2571 /* Synchronize RAM block list */
2572 total_ram_bytes = addr;
2573 while (!ret && total_ram_bytes) {
2574 RAMBlock *block;
2575 char id[256];
2576 ram_addr_t length;
2578 len = qemu_get_byte(f);
2579 qemu_get_buffer(f, (uint8_t *)id, len);
2580 id[len] = 0;
2581 length = qemu_get_be64(f);
2583 block = qemu_ram_block_by_name(id);
2584 if (block) {
2585 if (length != block->used_length) {
2586 Error *local_err = NULL;
2588 ret = qemu_ram_resize(block, length,
2589 &local_err);
2590 if (local_err) {
2591 error_report_err(local_err);
2594 /* For postcopy we need to check hugepage sizes match */
2595 if (postcopy_advised &&
2596 block->page_size != qemu_host_page_size) {
2597 uint64_t remote_page_size = qemu_get_be64(f);
2598 if (remote_page_size != block->page_size) {
2599 error_report("Mismatched RAM page size %s "
2600 "(local) %zd != %" PRId64,
2601 id, block->page_size,
2602 remote_page_size);
2603 ret = -EINVAL;
2606 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2607 block->idstr);
2608 } else {
2609 error_report("Unknown ramblock \"%s\", cannot "
2610 "accept migration", id);
2611 ret = -EINVAL;
2614 total_ram_bytes -= length;
2616 break;
2618 case RAM_SAVE_FLAG_ZERO:
2619 ch = qemu_get_byte(f);
2620 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2621 break;
2623 case RAM_SAVE_FLAG_PAGE:
2624 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2625 break;
2627 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2628 len = qemu_get_be32(f);
2629 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2630 error_report("Invalid compressed data length: %d", len);
2631 ret = -EINVAL;
2632 break;
2634 decompress_data_with_multi_threads(f, host, len);
2635 break;
2637 case RAM_SAVE_FLAG_XBZRLE:
2638 if (load_xbzrle(f, addr, host) < 0) {
2639 error_report("Failed to decompress XBZRLE page at "
2640 RAM_ADDR_FMT, addr);
2641 ret = -EINVAL;
2642 break;
2644 break;
2645 case RAM_SAVE_FLAG_EOS:
2646 /* normal exit */
2647 break;
2648 default:
2649 if (flags & RAM_SAVE_FLAG_HOOK) {
2650 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2651 } else {
2652 error_report("Unknown combination of migration flags: %#x",
2653 flags);
2654 ret = -EINVAL;
2657 if (!ret) {
2658 ret = qemu_file_get_error(f);
2662 wait_for_decompress_done();
2663 rcu_read_unlock();
2664 trace_ram_load_complete(ret, seq_iter);
2665 return ret;
2668 static SaveVMHandlers savevm_ram_handlers = {
2669 .save_live_setup = ram_save_setup,
2670 .save_live_iterate = ram_save_iterate,
2671 .save_live_complete_postcopy = ram_save_complete,
2672 .save_live_complete_precopy = ram_save_complete,
2673 .save_live_pending = ram_save_pending,
2674 .load_state = ram_load,
2675 .cleanup = ram_migration_cleanup,
2678 void ram_mig_init(void)
2680 qemu_mutex_init(&XBZRLE.lock);
2681 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
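/*
 * Note: the version number registered here (4) is the same value that
 * ram_load() checks above; changing one without the other would make
 * incoming migration fail with -EINVAL.
 */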