migration: Make cache_init() take an error parameter
[qemu/ar7.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
28 #include "qemu/osdep.h"
29 #include "cpu.h"
30 #include <zlib.h>
31 #include "qapi-event.h"
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "migration/page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/qmp/qerror.h"
46 #include "trace.h"
47 #include "exec/ram_addr.h"
48 #include "qemu/rcu_queue.h"
49 #include "migration/colo.h"
50 #include "migration/block.h"
52 /***********************************************************/
53 /* ram save/restore */
55 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
56 * worked for pages that were filled with the same char. We switched
57 * it to only search for the zero value, and to avoid confusion with
58 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
61 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
62 #define RAM_SAVE_FLAG_ZERO 0x02
63 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
64 #define RAM_SAVE_FLAG_PAGE 0x08
65 #define RAM_SAVE_FLAG_EOS 0x10
66 #define RAM_SAVE_FLAG_CONTINUE 0x20
67 #define RAM_SAVE_FLAG_XBZRLE 0x40
68 /* 0x80 is reserved in migration.h start with 0x100 next */
69 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
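/*
 * Illustrative sketch, not part of ram.c: these flags are OR'ed into the
 * low bits of the page offset written on the wire (see save_page_header()
 * below), which works because page offsets are TARGET_PAGE_SIZE aligned.
 * A receiver can split them apart again along these lines; the helper name
 * is hypothetical, only the masking convention is taken from this file.
 */
#if 0 /* example only */
static void example_split_wire_addr(QEMUFile *f)
{
    uint64_t addr = qemu_get_be64(f);
    uint64_t flags = addr & ~TARGET_PAGE_MASK; /* RAM_SAVE_FLAG_* bits */

    addr &= TARGET_PAGE_MASK;                  /* page-aligned offset   */
    (void)flags;
    (void)addr;
}
#endif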
71 static inline bool is_zero_range(uint8_t *p, uint64_t size)
73 return buffer_is_zero(p, size);
76 XBZRLECacheStats xbzrle_counters;
78 /* struct contains XBZRLE cache and a static page
79 used by the compression */
80 static struct {
81 /* buffer used for XBZRLE encoding */
82 uint8_t *encoded_buf;
83 /* buffer for storing page content */
84 uint8_t *current_buf;
85 /* Cache for XBZRLE, Protected by lock. */
86 PageCache *cache;
87 QemuMutex lock;
88 /* it will store a page full of zeros */
89 uint8_t *zero_target_page;
90 /* buffer used for XBZRLE decoding */
91 uint8_t *decoded_buf;
92 } XBZRLE;
94 static void XBZRLE_cache_lock(void)
96 if (migrate_use_xbzrle())
97 qemu_mutex_lock(&XBZRLE.lock);
100 static void XBZRLE_cache_unlock(void)
102 if (migrate_use_xbzrle())
103 qemu_mutex_unlock(&XBZRLE.lock);
107 * xbzrle_cache_resize: resize the xbzrle cache
109 * This function is called from qmp_migrate_set_cache_size in main
110 * thread, possibly while a migration is in progress. A running
111 * migration may be using the cache and might finish during this call,
112 * hence changes to the cache are protected by XBZRLE.lock().
114 * Returns the new_size or negative in case of error.
116 * @new_size: new cache size
117 * @errp: set *errp if the check failed, with reason
119 int64_t xbzrle_cache_resize(int64_t new_size, Error **errp)
121 PageCache *new_cache;
122 int64_t ret;
124 /* Check for truncation */
125 if (new_size != (size_t)new_size) {
126 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
127 "exceeding address space");
128 return -1;
131 /* Cache should not be larger than guest ram size */
132 if (new_size > ram_bytes_total()) {
133 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134 "exceeds guest ram size");
135 return -1;
138 XBZRLE_cache_lock();
140 if (XBZRLE.cache != NULL) {
141 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
142 goto out_new_size;
144 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
145 if (!new_cache) {
146 ret = -1;
147 goto out;
150 cache_fini(XBZRLE.cache);
151 XBZRLE.cache = new_cache;
154 out_new_size:
155 ret = pow2floor(new_size);
156 out:
157 XBZRLE_cache_unlock();
158 return ret;
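/*
 * Illustrative sketch, not part of ram.c: with cache_init() now taking an
 * Error ** (see ram_state_init() below), a caller of xbzrle_cache_resize()
 * can hand its errp through and report the reason for a failed resize.
 * The function and variable names here are hypothetical; error_propagate()
 * is the standard QAPI error helper.
 */
#if 0 /* example only */
static void example_set_xbzrle_cache_size(int64_t value, Error **errp)
{
    Error *local_err = NULL;
    int64_t new_size = xbzrle_cache_resize(value, &local_err);

    if (new_size < 0) {
        /* the size checks or cache_init() filled local_err with the reason */
        error_propagate(errp, local_err);
        return;
    }
    /* on success, new_size is the pow2floor'ed size actually in effect */
}
#endif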
162 * An outstanding page request, on the source, having been received
163 * and queued
165 struct RAMSrcPageRequest {
166 RAMBlock *rb;
167 hwaddr offset;
168 hwaddr len;
170 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
173 /* State of RAM for migration */
174 struct RAMState {
175 /* QEMUFile used for this migration */
176 QEMUFile *f;
177 /* Last block that we have visited searching for dirty pages */
178 RAMBlock *last_seen_block;
179 /* Last block from where we have sent data */
180 RAMBlock *last_sent_block;
181 /* Last dirty target page we have sent */
182 ram_addr_t last_page;
183 /* last ram version we have seen */
184 uint32_t last_version;
185 /* We are in the first round */
186 bool ram_bulk_stage;
187 /* How many times the dirty rate has been too high */
188 int dirty_rate_high_cnt;
189 /* these variables are used for bitmap sync */
190 /* last time we did a full bitmap_sync */
191 int64_t time_last_bitmap_sync;
192 /* bytes transferred at start_time */
193 uint64_t bytes_xfer_prev;
194 /* number of dirty pages since start_time */
195 uint64_t num_dirty_pages_period;
196 /* xbzrle misses since the beginning of the period */
197 uint64_t xbzrle_cache_miss_prev;
198 /* number of iterations at the beginning of period */
199 uint64_t iterations_prev;
200 /* Iterations since start */
201 uint64_t iterations;
202 /* number of dirty bits in the bitmap */
203 uint64_t migration_dirty_pages;
204 /* protects modification of the bitmap */
205 QemuMutex bitmap_mutex;
206 /* The RAMBlock used in the last src_page_requests */
207 RAMBlock *last_req_rb;
208 /* Queue of outstanding page requests from the destination */
209 QemuMutex src_page_req_mutex;
210 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
212 typedef struct RAMState RAMState;
214 static RAMState *ram_state;
216 uint64_t ram_bytes_remaining(void)
218 return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
221 MigrationStats ram_counters;
223 /* used by the search for pages to send */
224 struct PageSearchStatus {
225 /* Current block being searched */
226 RAMBlock *block;
227 /* Current page to search from */
228 unsigned long page;
229 /* Set once we wrap around */
230 bool complete_round;
232 typedef struct PageSearchStatus PageSearchStatus;
234 struct CompressParam {
235 bool done;
236 bool quit;
237 QEMUFile *file;
238 QemuMutex mutex;
239 QemuCond cond;
240 RAMBlock *block;
241 ram_addr_t offset;
243 typedef struct CompressParam CompressParam;
245 struct DecompressParam {
246 bool done;
247 bool quit;
248 QemuMutex mutex;
249 QemuCond cond;
250 void *des;
251 uint8_t *compbuf;
252 int len;
254 typedef struct DecompressParam DecompressParam;
256 static CompressParam *comp_param;
257 static QemuThread *compress_threads;
258 /* comp_done_cond is used to wake up the migration thread when
259 * one of the compression threads has finished the compression.
260 * comp_done_lock is used together with comp_done_cond.
262 static QemuMutex comp_done_lock;
263 static QemuCond comp_done_cond;
264 /* The empty QEMUFileOps will be used by file in CompressParam */
265 static const QEMUFileOps empty_ops = { };
267 static DecompressParam *decomp_param;
268 static QemuThread *decompress_threads;
269 static QemuMutex decomp_done_lock;
270 static QemuCond decomp_done_cond;
272 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
273 ram_addr_t offset);
275 static void *do_data_compress(void *opaque)
277 CompressParam *param = opaque;
278 RAMBlock *block;
279 ram_addr_t offset;
281 qemu_mutex_lock(&param->mutex);
282 while (!param->quit) {
283 if (param->block) {
284 block = param->block;
285 offset = param->offset;
286 param->block = NULL;
287 qemu_mutex_unlock(&param->mutex);
289 do_compress_ram_page(param->file, block, offset);
291 qemu_mutex_lock(&comp_done_lock);
292 param->done = true;
293 qemu_cond_signal(&comp_done_cond);
294 qemu_mutex_unlock(&comp_done_lock);
296 qemu_mutex_lock(&param->mutex);
297 } else {
298 qemu_cond_wait(&param->cond, &param->mutex);
301 qemu_mutex_unlock(&param->mutex);
303 return NULL;
306 static inline void terminate_compression_threads(void)
308 int idx, thread_count;
310 thread_count = migrate_compress_threads();
312 for (idx = 0; idx < thread_count; idx++) {
313 qemu_mutex_lock(&comp_param[idx].mutex);
314 comp_param[idx].quit = true;
315 qemu_cond_signal(&comp_param[idx].cond);
316 qemu_mutex_unlock(&comp_param[idx].mutex);
320 static void compress_threads_save_cleanup(void)
322 int i, thread_count;
324 if (!migrate_use_compression()) {
325 return;
327 terminate_compression_threads();
328 thread_count = migrate_compress_threads();
329 for (i = 0; i < thread_count; i++) {
330 qemu_thread_join(compress_threads + i);
331 qemu_fclose(comp_param[i].file);
332 qemu_mutex_destroy(&comp_param[i].mutex);
333 qemu_cond_destroy(&comp_param[i].cond);
335 qemu_mutex_destroy(&comp_done_lock);
336 qemu_cond_destroy(&comp_done_cond);
337 g_free(compress_threads);
338 g_free(comp_param);
339 compress_threads = NULL;
340 comp_param = NULL;
343 static void compress_threads_save_setup(void)
345 int i, thread_count;
347 if (!migrate_use_compression()) {
348 return;
350 thread_count = migrate_compress_threads();
351 compress_threads = g_new0(QemuThread, thread_count);
352 comp_param = g_new0(CompressParam, thread_count);
353 qemu_cond_init(&comp_done_cond);
354 qemu_mutex_init(&comp_done_lock);
355 for (i = 0; i < thread_count; i++) {
356 /* comp_param[i].file is just used as a dummy buffer to save data,
357 * set its ops to empty.
359 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
360 comp_param[i].done = true;
361 comp_param[i].quit = false;
362 qemu_mutex_init(&comp_param[i].mutex);
363 qemu_cond_init(&comp_param[i].cond);
364 qemu_thread_create(compress_threads + i, "compress",
365 do_data_compress, comp_param + i,
366 QEMU_THREAD_JOINABLE);
370 /* Multiple fd's */
372 struct MultiFDSendParams {
373 uint8_t id;
374 char *name;
375 QemuThread thread;
376 QemuSemaphore sem;
377 QemuMutex mutex;
378 bool quit;
380 typedef struct MultiFDSendParams MultiFDSendParams;
382 struct {
383 MultiFDSendParams *params;
384 /* number of created threads */
385 int count;
386 } *multifd_send_state;
388 static void terminate_multifd_send_threads(Error *errp)
390 int i;
392 for (i = 0; i < multifd_send_state->count; i++) {
393 MultiFDSendParams *p = &multifd_send_state->params[i];
395 qemu_mutex_lock(&p->mutex);
396 p->quit = true;
397 qemu_sem_post(&p->sem);
398 qemu_mutex_unlock(&p->mutex);
402 int multifd_save_cleanup(Error **errp)
404 int i;
405 int ret = 0;
407 if (!migrate_use_multifd()) {
408 return 0;
410 terminate_multifd_send_threads(NULL);
411 for (i = 0; i < multifd_send_state->count; i++) {
412 MultiFDSendParams *p = &multifd_send_state->params[i];
414 qemu_thread_join(&p->thread);
415 qemu_mutex_destroy(&p->mutex);
416 qemu_sem_destroy(&p->sem);
417 g_free(p->name);
418 p->name = NULL;
420 g_free(multifd_send_state->params);
421 multifd_send_state->params = NULL;
422 g_free(multifd_send_state);
423 multifd_send_state = NULL;
424 return ret;
427 static void *multifd_send_thread(void *opaque)
429 MultiFDSendParams *p = opaque;
431 while (true) {
432 qemu_mutex_lock(&p->mutex);
433 if (p->quit) {
434 qemu_mutex_unlock(&p->mutex);
435 break;
437 qemu_mutex_unlock(&p->mutex);
438 qemu_sem_wait(&p->sem);
441 return NULL;
444 int multifd_save_setup(void)
446 int thread_count;
447 uint8_t i;
449 if (!migrate_use_multifd()) {
450 return 0;
452 thread_count = migrate_multifd_channels();
453 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
454 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
455 multifd_send_state->count = 0;
456 for (i = 0; i < thread_count; i++) {
457 MultiFDSendParams *p = &multifd_send_state->params[i];
459 qemu_mutex_init(&p->mutex);
460 qemu_sem_init(&p->sem, 0);
461 p->quit = false;
462 p->id = i;
463 p->name = g_strdup_printf("multifdsend_%d", i);
464 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
465 QEMU_THREAD_JOINABLE);
467 multifd_send_state->count++;
469 return 0;
472 struct MultiFDRecvParams {
473 uint8_t id;
474 char *name;
475 QemuThread thread;
476 QemuSemaphore sem;
477 QemuMutex mutex;
478 bool quit;
480 typedef struct MultiFDRecvParams MultiFDRecvParams;
482 struct {
483 MultiFDRecvParams *params;
484 /* number of created threads */
485 int count;
486 } *multifd_recv_state;
488 static void terminate_multifd_recv_threads(Error *errp)
490 int i;
492 for (i = 0; i < multifd_recv_state->count; i++) {
493 MultiFDRecvParams *p = &multifd_recv_state->params[i];
495 qemu_mutex_lock(&p->mutex);
496 p->quit = true;
497 qemu_sem_post(&p->sem);
498 qemu_mutex_unlock(&p->mutex);
502 int multifd_load_cleanup(Error **errp)
504 int i;
505 int ret = 0;
507 if (!migrate_use_multifd()) {
508 return 0;
510 terminate_multifd_recv_threads(NULL);
511 for (i = 0; i < multifd_recv_state->count; i++) {
512 MultiFDRecvParams *p = &multifd_recv_state->params[i];
514 qemu_thread_join(&p->thread);
515 qemu_mutex_destroy(&p->mutex);
516 qemu_sem_destroy(&p->sem);
517 g_free(p->name);
518 p->name = NULL;
520 g_free(multifd_recv_state->params);
521 multifd_recv_state->params = NULL;
522 g_free(multifd_recv_state);
523 multifd_recv_state = NULL;
525 return ret;
528 static void *multifd_recv_thread(void *opaque)
530 MultiFDRecvParams *p = opaque;
532 while (true) {
533 qemu_mutex_lock(&p->mutex);
534 if (p->quit) {
535 qemu_mutex_unlock(&p->mutex);
536 break;
538 qemu_mutex_unlock(&p->mutex);
539 qemu_sem_wait(&p->sem);
542 return NULL;
545 int multifd_load_setup(void)
547 int thread_count;
548 uint8_t i;
550 if (!migrate_use_multifd()) {
551 return 0;
553 thread_count = migrate_multifd_channels();
554 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
555 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
556 multifd_recv_state->count = 0;
557 for (i = 0; i < thread_count; i++) {
558 MultiFDRecvParams *p = &multifd_recv_state->params[i];
560 qemu_mutex_init(&p->mutex);
561 qemu_sem_init(&p->sem, 0);
562 p->quit = false;
563 p->id = i;
564 p->name = g_strdup_printf("multifdrecv_%d", i);
565 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
566 QEMU_THREAD_JOINABLE);
567 multifd_recv_state->count++;
569 return 0;
573 * save_page_header: write page header to wire
575 * If this is the 1st block, it also writes the block identification
577 * Returns the number of bytes written
579 * @f: QEMUFile where to send the data
580 * @block: block that contains the page we want to send
581 * @offset: offset inside the block for the page
582 * in the lower bits, it carries the RAM_SAVE_FLAG_* flags
584 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
585 ram_addr_t offset)
587 size_t size, len;
589 if (block == rs->last_sent_block) {
590 offset |= RAM_SAVE_FLAG_CONTINUE;
592 qemu_put_be64(f, offset);
593 size = 8;
595 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
596 len = strlen(block->idstr);
597 qemu_put_byte(f, len);
598 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
599 size += 1 + len;
600 rs->last_sent_block = block;
602 return size;
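/*
 * Worked example (illustrative, block name hypothetical): for the first
 * page sent from a block whose idstr is "pc.ram" (strlen 6), the header
 * above costs 8 (be64 offset|flags) + 1 (length byte) + 6 (idstr) = 15
 * bytes. Every later page of the same block has RAM_SAVE_FLAG_CONTINUE
 * set and costs only the 8-byte be64.
 */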
606 * mig_throttle_guest_down: throttle down the guest
608 * Reduce amount of guest cpu execution to hopefully slow down memory
609 * writes. If guest dirty memory rate is reduced below the rate at
610 * which we can transfer pages to the destination then we should be
611 * able to complete migration. Some workloads dirty memory way too
612 * fast and will not effectively converge, even with auto-converge.
614 static void mig_throttle_guest_down(void)
616 MigrationState *s = migrate_get_current();
617 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
618 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
620 /* We have not started throttling yet. Let's start it. */
621 if (!cpu_throttle_active()) {
622 cpu_throttle_set(pct_initial);
623 } else {
624 /* Throttling already on, just increase the rate */
625 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
630 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
632 * @rs: current RAM state
633 * @current_addr: address for the zero page
635 * Update the xbzrle cache to reflect a page that's been sent as all 0.
636 * The important thing is that a stale (not-yet-0'd) page be replaced
637 * by the new data.
638 * As a bonus, if the page wasn't in the cache it gets added so that
639 * when a small write is made into the 0'd page it gets XBZRLE sent.
641 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
643 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
644 return;
647 /* We don't care if this fails to allocate a new cache page
648 * as long as it updated an old one */
649 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
650 ram_counters.dirty_sync_count);
653 #define ENCODING_FLAG_XBZRLE 0x1
656 * save_xbzrle_page: compress and send current page
658 * Returns: 1 means that we wrote the page
659 * 0 means that page is identical to the one already sent
660 * -1 means that xbzrle would be longer than normal
662 * @rs: current RAM state
663 * @current_data: pointer to the address of the page contents
664 * @current_addr: addr of the page
665 * @block: block that contains the page we want to send
666 * @offset: offset inside the block for the page
667 * @last_stage: if we are at the completion stage
669 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
670 ram_addr_t current_addr, RAMBlock *block,
671 ram_addr_t offset, bool last_stage)
673 int encoded_len = 0, bytes_xbzrle;
674 uint8_t *prev_cached_page;
676 if (!cache_is_cached(XBZRLE.cache, current_addr,
677 ram_counters.dirty_sync_count)) {
678 xbzrle_counters.cache_miss++;
679 if (!last_stage) {
680 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
681 ram_counters.dirty_sync_count) == -1) {
682 return -1;
683 } else {
684 /* update *current_data when the page has been
685 inserted into cache */
686 *current_data = get_cached_data(XBZRLE.cache, current_addr);
689 return -1;
692 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
694 /* save current buffer into memory */
695 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
697 /* XBZRLE encoding (if there is no overflow) */
698 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
699 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
700 TARGET_PAGE_SIZE);
701 if (encoded_len == 0) {
702 trace_save_xbzrle_page_skipping();
703 return 0;
704 } else if (encoded_len == -1) {
705 trace_save_xbzrle_page_overflow();
706 xbzrle_counters.overflow++;
707 /* update data in the cache */
708 if (!last_stage) {
709 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
710 *current_data = prev_cached_page;
712 return -1;
715 /* we need to update the data in the cache, in order to get the same data */
716 if (!last_stage) {
717 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
720 /* Send XBZRLE based compressed page */
721 bytes_xbzrle = save_page_header(rs, rs->f, block,
722 offset | RAM_SAVE_FLAG_XBZRLE);
723 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
724 qemu_put_be16(rs->f, encoded_len);
725 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
726 bytes_xbzrle += encoded_len + 1 + 2;
727 xbzrle_counters.pages++;
728 xbzrle_counters.bytes += bytes_xbzrle;
729 ram_counters.transferred += bytes_xbzrle;
731 return 1;
735 * migration_bitmap_find_dirty: find the next dirty page from start
737 * Called with rcu_read_lock() to protect migration_bitmap
739 * Returns the page index within the RAMBlock of the start of the next dirty page
741 * @rs: current RAM state
742 * @rb: RAMBlock where to search for dirty pages
743 * @start: page where we start the search
745 static inline
746 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
747 unsigned long start)
749 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
750 unsigned long *bitmap = rb->bmap;
751 unsigned long next;
753 if (rs->ram_bulk_stage && start > 0) {
754 next = start + 1;
755 } else {
756 next = find_next_bit(bitmap, size, start);
759 return next;
762 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
763 RAMBlock *rb,
764 unsigned long page)
766 bool ret;
768 ret = test_and_clear_bit(page, rb->bmap);
770 if (ret) {
771 rs->migration_dirty_pages--;
773 return ret;
776 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
777 ram_addr_t start, ram_addr_t length)
779 rs->migration_dirty_pages +=
780 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
781 &rs->num_dirty_pages_period);
785 * ram_pagesize_summary: calculate all the pagesizes of a VM
787 * Returns a summary bitmap of the page sizes of all RAMBlocks
789 * For VMs with just normal pages this is equivalent to the host page
790 * size. If it's got some huge pages then it's the OR of all the
791 * different page sizes.
793 uint64_t ram_pagesize_summary(void)
795 RAMBlock *block;
796 uint64_t summary = 0;
798 RAMBLOCK_FOREACH(block) {
799 summary |= block->page_size;
802 return summary;
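/*
 * Worked example: a guest whose RAM blocks all use 4 KiB pages yields a
 * summary of 0x1000; if one block is additionally backed by 2 MiB huge
 * pages the result is 0x1000 | 0x200000 = 0x201000.
 */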
805 static void migration_bitmap_sync(RAMState *rs)
807 RAMBlock *block;
808 int64_t end_time;
809 uint64_t bytes_xfer_now;
811 ram_counters.dirty_sync_count++;
813 if (!rs->time_last_bitmap_sync) {
814 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
817 trace_migration_bitmap_sync_start();
818 memory_global_dirty_log_sync();
820 qemu_mutex_lock(&rs->bitmap_mutex);
821 rcu_read_lock();
822 RAMBLOCK_FOREACH(block) {
823 migration_bitmap_sync_range(rs, block, 0, block->used_length);
825 rcu_read_unlock();
826 qemu_mutex_unlock(&rs->bitmap_mutex);
828 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
830 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
832 /* more than 1 second = 1000 milliseconds */
833 if (end_time > rs->time_last_bitmap_sync + 1000) {
834 /* calculate period counters */
835 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
836 / (end_time - rs->time_last_bitmap_sync);
837 bytes_xfer_now = ram_counters.transferred;
839 /* During block migration the auto-converge logic incorrectly detects
840 * that ram migration makes no progress. Avoid this by disabling the
841 * throttling logic during the bulk phase of block migration. */
842 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
843 /* The following detection logic can be refined later. For now:
844 Check to see if the dirtied bytes exceed 50% of the approx.
845 amount of bytes that just got transferred since the last time we
846 were in this routine. If that happens twice, start or increase
847 throttling */
849 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
850 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
851 (++rs->dirty_rate_high_cnt >= 2)) {
852 trace_migration_throttle();
853 rs->dirty_rate_high_cnt = 0;
854 mig_throttle_guest_down();
858 if (migrate_use_xbzrle()) {
859 if (rs->iterations_prev != rs->iterations) {
860 xbzrle_counters.cache_miss_rate =
861 (double)(xbzrle_counters.cache_miss -
862 rs->xbzrle_cache_miss_prev) /
863 (rs->iterations - rs->iterations_prev);
865 rs->iterations_prev = rs->iterations;
866 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
869 /* reset period counters */
870 rs->time_last_bitmap_sync = end_time;
871 rs->num_dirty_pages_period = 0;
872 rs->bytes_xfer_prev = bytes_xfer_now;
874 if (migrate_use_events()) {
875 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
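/*
 * Worked example for the auto-converge check above (numbers hypothetical):
 * if roughly 400 MB were transferred during the last ~1 s period and pages
 * worth 250 MB were dirtied in the same period, then 250 MB > 400 MB / 2,
 * so dirty_rate_high_cnt is bumped; if the next period looks the same,
 * mig_throttle_guest_down() is called, which starts throttling at
 * cpu_throttle_initial or raises it by cpu_throttle_increment.
 */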
880 * save_zero_page: send the zero page to the stream
882 * Returns the number of pages written.
884 * @rs: current RAM state
885 * @block: block that contains the page we want to send
886 * @offset: offset inside the block for the page
887 * @p: pointer to the page
889 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
890 uint8_t *p)
892 int pages = -1;
894 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
895 ram_counters.duplicate++;
896 ram_counters.transferred +=
897 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
898 qemu_put_byte(rs->f, 0);
899 ram_counters.transferred += 1;
900 pages = 1;
903 return pages;
906 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
908 if (!migrate_release_ram() || !migration_in_postcopy()) {
909 return;
912 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
916 * ram_save_page: send the given page to the stream
918 * Returns the number of pages written.
919 * < 0 - error
920 * >=0 - Number of pages written - this might legally be 0
921 * if xbzrle noticed the page was the same.
923 * @rs: current RAM state
924 * @block: block that contains the page we want to send
925 * @offset: offset inside the block for the page
926 * @last_stage: if we are at the completion stage
928 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
930 int pages = -1;
931 uint64_t bytes_xmit;
932 ram_addr_t current_addr;
933 uint8_t *p;
934 int ret;
935 bool send_async = true;
936 RAMBlock *block = pss->block;
937 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
939 p = block->host + offset;
940 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
942 /* If in doubt, send the page as normal */
943 bytes_xmit = 0;
944 ret = ram_control_save_page(rs->f, block->offset,
945 offset, TARGET_PAGE_SIZE, &bytes_xmit);
946 if (bytes_xmit) {
947 ram_counters.transferred += bytes_xmit;
948 pages = 1;
951 XBZRLE_cache_lock();
953 current_addr = block->offset + offset;
955 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
956 if (ret != RAM_SAVE_CONTROL_DELAYED) {
957 if (bytes_xmit > 0) {
958 ram_counters.normal++;
959 } else if (bytes_xmit == 0) {
960 ram_counters.duplicate++;
963 } else {
964 pages = save_zero_page(rs, block, offset, p);
965 if (pages > 0) {
966 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
967 * page would be stale
969 xbzrle_cache_zero_page(rs, current_addr);
970 ram_release_pages(block->idstr, offset, pages);
971 } else if (!rs->ram_bulk_stage &&
972 !migration_in_postcopy() && migrate_use_xbzrle()) {
973 pages = save_xbzrle_page(rs, &p, current_addr, block,
974 offset, last_stage);
975 if (!last_stage) {
976 /* Can't send this cached data async, since the cache page
977 * might get updated before it gets to the wire
979 send_async = false;
984 /* XBZRLE overflow or normal page */
985 if (pages == -1) {
986 ram_counters.transferred +=
987 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
988 if (send_async) {
989 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
990 migrate_release_ram() &
991 migration_in_postcopy());
992 } else {
993 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
995 ram_counters.transferred += TARGET_PAGE_SIZE;
996 pages = 1;
997 ram_counters.normal++;
1000 XBZRLE_cache_unlock();
1002 return pages;
1005 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
1006 ram_addr_t offset)
1008 RAMState *rs = ram_state;
1009 int bytes_sent, blen;
1010 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1012 bytes_sent = save_page_header(rs, f, block, offset |
1013 RAM_SAVE_FLAG_COMPRESS_PAGE);
1014 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1015 migrate_compress_level());
1016 if (blen < 0) {
1017 bytes_sent = 0;
1018 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1019 error_report("compressed data failed!");
1020 } else {
1021 bytes_sent += blen;
1022 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1025 return bytes_sent;
1028 static void flush_compressed_data(RAMState *rs)
1030 int idx, len, thread_count;
1032 if (!migrate_use_compression()) {
1033 return;
1035 thread_count = migrate_compress_threads();
1037 qemu_mutex_lock(&comp_done_lock);
1038 for (idx = 0; idx < thread_count; idx++) {
1039 while (!comp_param[idx].done) {
1040 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1043 qemu_mutex_unlock(&comp_done_lock);
1045 for (idx = 0; idx < thread_count; idx++) {
1046 qemu_mutex_lock(&comp_param[idx].mutex);
1047 if (!comp_param[idx].quit) {
1048 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1049 ram_counters.transferred += len;
1051 qemu_mutex_unlock(&comp_param[idx].mutex);
1055 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1056 ram_addr_t offset)
1058 param->block = block;
1059 param->offset = offset;
1062 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1063 ram_addr_t offset)
1065 int idx, thread_count, bytes_xmit = -1, pages = -1;
1067 thread_count = migrate_compress_threads();
1068 qemu_mutex_lock(&comp_done_lock);
1069 while (true) {
1070 for (idx = 0; idx < thread_count; idx++) {
1071 if (comp_param[idx].done) {
1072 comp_param[idx].done = false;
1073 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1074 qemu_mutex_lock(&comp_param[idx].mutex);
1075 set_compress_params(&comp_param[idx], block, offset);
1076 qemu_cond_signal(&comp_param[idx].cond);
1077 qemu_mutex_unlock(&comp_param[idx].mutex);
1078 pages = 1;
1079 ram_counters.normal++;
1080 ram_counters.transferred += bytes_xmit;
1081 break;
1084 if (pages > 0) {
1085 break;
1086 } else {
1087 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1090 qemu_mutex_unlock(&comp_done_lock);
1092 return pages;
1096 * ram_save_compressed_page: compress the given page and send it to the stream
1098 * Returns the number of pages written.
1100 * @rs: current RAM state
1101 * @block: block that contains the page we want to send
1102 * @offset: offset inside the block for the page
1103 * @last_stage: if we are at the completion stage
1105 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
1106 bool last_stage)
1108 int pages = -1;
1109 uint64_t bytes_xmit = 0;
1110 uint8_t *p;
1111 int ret, blen;
1112 RAMBlock *block = pss->block;
1113 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1115 p = block->host + offset;
1117 ret = ram_control_save_page(rs->f, block->offset,
1118 offset, TARGET_PAGE_SIZE, &bytes_xmit);
1119 if (bytes_xmit) {
1120 ram_counters.transferred += bytes_xmit;
1121 pages = 1;
1123 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1124 if (ret != RAM_SAVE_CONTROL_DELAYED) {
1125 if (bytes_xmit > 0) {
1126 ram_counters.normal++;
1127 } else if (bytes_xmit == 0) {
1128 ram_counters.duplicate++;
1131 } else {
1132 /* When starting the process of a new block, the first page of
1133 * the block should be sent out before other pages in the same
1134 * block, and all the pages in the last block should have been sent
1135 * out. Keeping this order is important, because the 'cont' flag
1136 * is used to avoid resending the block name.
1138 if (block != rs->last_sent_block) {
1139 flush_compressed_data(rs);
1140 pages = save_zero_page(rs, block, offset, p);
1141 if (pages == -1) {
1142 /* Make sure the first page is sent out before other pages */
1143 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1144 RAM_SAVE_FLAG_COMPRESS_PAGE);
1145 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1146 migrate_compress_level());
1147 if (blen > 0) {
1148 ram_counters.transferred += bytes_xmit + blen;
1149 ram_counters.normal++;
1150 pages = 1;
1151 } else {
1152 qemu_file_set_error(rs->f, blen);
1153 error_report("compressed data failed!");
1156 if (pages > 0) {
1157 ram_release_pages(block->idstr, offset, pages);
1159 } else {
1160 pages = save_zero_page(rs, block, offset, p);
1161 if (pages == -1) {
1162 pages = compress_page_with_multi_thread(rs, block, offset);
1163 } else {
1164 ram_release_pages(block->idstr, offset, pages);
1169 return pages;
1173 * find_dirty_block: find the next dirty page and update any state
1174 * associated with the search process.
1176 * Returns if a page is found
1178 * @rs: current RAM state
1179 * @pss: data about the state of the current dirty page scan
1180 * @again: set to false if the search has scanned the whole of RAM
1182 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1184 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1185 if (pss->complete_round && pss->block == rs->last_seen_block &&
1186 pss->page >= rs->last_page) {
1188 * We've been once around the RAM and haven't found anything.
1189 * Give up.
1191 *again = false;
1192 return false;
1194 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1195 /* Didn't find anything in this RAM Block */
1196 pss->page = 0;
1197 pss->block = QLIST_NEXT_RCU(pss->block, next);
1198 if (!pss->block) {
1199 /* Hit the end of the list */
1200 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1201 /* Flag that we've looped */
1202 pss->complete_round = true;
1203 rs->ram_bulk_stage = false;
1204 if (migrate_use_xbzrle()) {
1205 /* If xbzrle is on, stop using the data compression at this
1206 * point. In theory, xbzrle can do better than compression.
1208 flush_compressed_data(rs);
1211 /* Didn't find anything this time, but try again on the new block */
1212 *again = true;
1213 return false;
1214 } else {
1215 /* Can go around again, but... */
1216 *again = true;
1217 /* We've found something so probably don't need to */
1218 return true;
1223 * unqueue_page: gets a page off the queue
1225 * Helper for 'get_queued_page' - gets a page off the queue
1227 * Returns the block of the page (or NULL if none available)
1229 * @rs: current RAM state
1230 * @offset: used to return the offset within the RAMBlock
1232 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1234 RAMBlock *block = NULL;
1236 qemu_mutex_lock(&rs->src_page_req_mutex);
1237 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1238 struct RAMSrcPageRequest *entry =
1239 QSIMPLEQ_FIRST(&rs->src_page_requests);
1240 block = entry->rb;
1241 *offset = entry->offset;
1243 if (entry->len > TARGET_PAGE_SIZE) {
1244 entry->len -= TARGET_PAGE_SIZE;
1245 entry->offset += TARGET_PAGE_SIZE;
1246 } else {
1247 memory_region_unref(block->mr);
1248 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1249 g_free(entry);
1252 qemu_mutex_unlock(&rs->src_page_req_mutex);
1254 return block;
1258 * get_queued_page: unqueue a page from the postcopy requests
1260 * Skips pages that are already sent (!dirty)
1262 * Returns if a queued page is found
1264 * @rs: current RAM state
1265 * @pss: data about the state of the current dirty page scan
1267 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1269 RAMBlock *block;
1270 ram_addr_t offset;
1271 bool dirty;
1273 do {
1274 block = unqueue_page(rs, &offset);
1276 * We're sending this page, and since it's postcopy nothing else
1277 * will dirty it, and we must make sure it doesn't get sent again
1278 * even if this queue request was received after the background
1279 * search already sent it.
1281 if (block) {
1282 unsigned long page;
1284 page = offset >> TARGET_PAGE_BITS;
1285 dirty = test_bit(page, block->bmap);
1286 if (!dirty) {
1287 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1288 page, test_bit(page, block->unsentmap));
1289 } else {
1290 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1294 } while (block && !dirty);
1296 if (block) {
1298 * As soon as we start servicing pages out of order, then we have
1299 * to kill the bulk stage, since the bulk stage assumes
1300 * in (migration_bitmap_find_and_reset_dirty) that every page is
1301 * dirty, that's no longer true.
1303 rs->ram_bulk_stage = false;
1306 * We want the background search to continue from the queued page
1307 * since the guest is likely to want other pages near to the page
1308 * it just requested.
1310 pss->block = block;
1311 pss->page = offset >> TARGET_PAGE_BITS;
1314 return !!block;
1318 * migration_page_queue_free: drop any remaining pages in the ram
1319 * request queue
1321 * It should be empty at the end anyway, but in error cases there may
1322 * be some left. In case any page is left, we drop it.
1325 static void migration_page_queue_free(RAMState *rs)
1327 struct RAMSrcPageRequest *mspr, *next_mspr;
1328 /* This queue generally should be empty - but in the case of a failed
1329 * migration it might have some leftovers in it.
1331 rcu_read_lock();
1332 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1333 memory_region_unref(mspr->rb->mr);
1334 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1335 g_free(mspr);
1337 rcu_read_unlock();
1341 * ram_save_queue_pages: queue the page for transmission
1343 * A request from postcopy destination for example.
1345 * Returns zero on success or negative on error
1347 * @rbname: Name of the RAMBlock of the request. NULL means the
1348 * same as the last one.
1349 * @start: starting address from the start of the RAMBlock
1350 * @len: length (in bytes) to send
1352 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1354 RAMBlock *ramblock;
1355 RAMState *rs = ram_state;
1357 ram_counters.postcopy_requests++;
1358 rcu_read_lock();
1359 if (!rbname) {
1360 /* Reuse last RAMBlock */
1361 ramblock = rs->last_req_rb;
1363 if (!ramblock) {
1365 * Shouldn't happen, we can't reuse the last RAMBlock if
1366 * it's the 1st request.
1368 error_report("ram_save_queue_pages no previous block");
1369 goto err;
1371 } else {
1372 ramblock = qemu_ram_block_by_name(rbname);
1374 if (!ramblock) {
1375 /* We shouldn't be asked for a non-existent RAMBlock */
1376 error_report("ram_save_queue_pages no block '%s'", rbname);
1377 goto err;
1379 rs->last_req_rb = ramblock;
1381 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1382 if (start+len > ramblock->used_length) {
1383 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1384 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1385 __func__, start, len, ramblock->used_length);
1386 goto err;
1389 struct RAMSrcPageRequest *new_entry =
1390 g_malloc0(sizeof(struct RAMSrcPageRequest));
1391 new_entry->rb = ramblock;
1392 new_entry->offset = start;
1393 new_entry->len = len;
1395 memory_region_ref(ramblock->mr);
1396 qemu_mutex_lock(&rs->src_page_req_mutex);
1397 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1398 qemu_mutex_unlock(&rs->src_page_req_mutex);
1399 rcu_read_unlock();
1401 return 0;
1403 err:
1404 rcu_read_unlock();
1405 return -1;
1409 * ram_save_target_page: save one target page
1411 * Returns the number of pages written
1413 * @rs: current RAM state
1414 * @ms: current migration state
1415 * @pss: data about the page we want to send
1416 * @last_stage: if we are at the completion stage
1418 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1419 bool last_stage)
1421 int res = 0;
1423 /* Check if the page is dirty and if it is, send it */
1424 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1426 * If xbzrle is on, stop using the data compression after first
1427 * round of migration even if compression is enabled. In theory,
1428 * xbzrle can do better than compression.
1430 if (migrate_use_compression() &&
1431 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1432 res = ram_save_compressed_page(rs, pss, last_stage);
1433 } else {
1434 res = ram_save_page(rs, pss, last_stage);
1437 if (res < 0) {
1438 return res;
1440 if (pss->block->unsentmap) {
1441 clear_bit(pss->page, pss->block->unsentmap);
1445 return res;
1449 * ram_save_host_page: save a whole host page
1451 * Starting at *offset send pages up to the end of the current host
1452 * page. It's valid for the initial offset to point into the middle of
1453 * a host page in which case the remainder of the hostpage is sent.
1454 * Only dirty target pages are sent. Note that the host page size may
1455 * be a huge page for this block.
1456 * The saving stops at the boundary of the used_length of the block
1457 * if the RAMBlock isn't a multiple of the host page size.
1459 * Returns the number of pages written or negative on error
1461 * @rs: current RAM state
1462 * @ms: current migration state
1463 * @pss: data about the page we want to send
1464 * @last_stage: if we are at the completion stage
1466 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1467 bool last_stage)
1469 int tmppages, pages = 0;
1470 size_t pagesize_bits =
1471 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1473 do {
1474 tmppages = ram_save_target_page(rs, pss, last_stage);
1475 if (tmppages < 0) {
1476 return tmppages;
1479 pages += tmppages;
1480 pss->page++;
1481 } while ((pss->page & (pagesize_bits - 1)) &&
1482 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1484 /* The offset we leave with is the last one we looked at */
1485 pss->page--;
1486 return pages;
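/*
 * Worked example: for a RAMBlock backed by 2 MiB huge pages on a target
 * with 4 KiB pages, pagesize_bits is 2 MiB / 4 KiB = 512, so the loop
 * above walks up to 512 consecutive target pages, i.e. exactly one host
 * page, before returning.
 */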
1490 * ram_find_and_save_block: finds a dirty page and sends it to f
1492 * Called within an RCU critical section.
1494 * Returns the number of pages written where zero means no dirty pages
1496 * @rs: current RAM state
1497 * @last_stage: if we are at the completion stage
1499 * On systems where host-page-size > target-page-size it will send all the
1500 * pages in a host page that are dirty.
1503 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1505 PageSearchStatus pss;
1506 int pages = 0;
1507 bool again, found;
1509 /* No dirty page as there is zero RAM */
1510 if (!ram_bytes_total()) {
1511 return pages;
1514 pss.block = rs->last_seen_block;
1515 pss.page = rs->last_page;
1516 pss.complete_round = false;
1518 if (!pss.block) {
1519 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1522 do {
1523 again = true;
1524 found = get_queued_page(rs, &pss);
1526 if (!found) {
1527 /* priority queue empty, so just search for something dirty */
1528 found = find_dirty_block(rs, &pss, &again);
1531 if (found) {
1532 pages = ram_save_host_page(rs, &pss, last_stage);
1534 } while (!pages && again);
1536 rs->last_seen_block = pss.block;
1537 rs->last_page = pss.page;
1539 return pages;
1542 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1544 uint64_t pages = size / TARGET_PAGE_SIZE;
1546 if (zero) {
1547 ram_counters.duplicate += pages;
1548 } else {
1549 ram_counters.normal += pages;
1550 ram_counters.transferred += size;
1551 qemu_update_position(f, size);
1555 uint64_t ram_bytes_total(void)
1557 RAMBlock *block;
1558 uint64_t total = 0;
1560 rcu_read_lock();
1561 RAMBLOCK_FOREACH(block) {
1562 total += block->used_length;
1564 rcu_read_unlock();
1565 return total;
1568 static void xbzrle_load_setup(void)
1570 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1573 static void xbzrle_load_cleanup(void)
1575 g_free(XBZRLE.decoded_buf);
1576 XBZRLE.decoded_buf = NULL;
1579 static void ram_save_cleanup(void *opaque)
1581 RAMState **rsp = opaque;
1582 RAMBlock *block;
1584 /* the caller holds the iothread lock or is in a bh, so there is
1585 * no writing race against this migration_bitmap
1587 memory_global_dirty_log_stop();
1589 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1590 g_free(block->bmap);
1591 block->bmap = NULL;
1592 g_free(block->unsentmap);
1593 block->unsentmap = NULL;
1596 XBZRLE_cache_lock();
1597 if (XBZRLE.cache) {
1598 cache_fini(XBZRLE.cache);
1599 g_free(XBZRLE.encoded_buf);
1600 g_free(XBZRLE.current_buf);
1601 g_free(XBZRLE.zero_target_page);
1602 XBZRLE.cache = NULL;
1603 XBZRLE.encoded_buf = NULL;
1604 XBZRLE.current_buf = NULL;
1605 XBZRLE.zero_target_page = NULL;
1607 XBZRLE_cache_unlock();
1608 migration_page_queue_free(*rsp);
1609 compress_threads_save_cleanup();
1610 g_free(*rsp);
1611 *rsp = NULL;
1614 static void ram_state_reset(RAMState *rs)
1616 rs->last_seen_block = NULL;
1617 rs->last_sent_block = NULL;
1618 rs->last_page = 0;
1619 rs->last_version = ram_list.version;
1620 rs->ram_bulk_stage = true;
1623 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1626 * 'expected' is the value you expect the bitmap mostly to be full
1627 * of; it won't bother printing lines that are all this value.
1628 * If 'todump' is null the migration bitmap is dumped.
1630 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1631 unsigned long pages)
1633 int64_t cur;
1634 int64_t linelen = 128;
1635 char linebuf[129];
1637 for (cur = 0; cur < pages; cur += linelen) {
1638 int64_t curb;
1639 bool found = false;
1641 * Last line; catch the case where the line length
1642 * is longer than remaining ram
1644 if (cur + linelen > pages) {
1645 linelen = pages - cur;
1647 for (curb = 0; curb < linelen; curb++) {
1648 bool thisbit = test_bit(cur + curb, todump);
1649 linebuf[curb] = thisbit ? '1' : '.';
1650 found = found || (thisbit != expected);
1652 if (found) {
1653 linebuf[curb] = '\0';
1654 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1659 /* **** functions for postcopy ***** */
1661 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1663 struct RAMBlock *block;
1665 RAMBLOCK_FOREACH(block) {
1666 unsigned long *bitmap = block->bmap;
1667 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1668 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1670 while (run_start < range) {
1671 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1672 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1673 (run_end - run_start) << TARGET_PAGE_BITS);
1674 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1680 * postcopy_send_discard_bm_ram: discard a RAMBlock
1682 * Returns zero on success
1684 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1685 * Note: At this point the 'unsentmap' is the processed bitmap combined
1686 * with the dirtymap; so a '1' means it's either dirty or unsent.
1688 * @ms: current migration state
1689 * @pds: state for postcopy
1690 * @block: RAMBlock to discard
1693 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1694 PostcopyDiscardState *pds,
1695 RAMBlock *block)
1697 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1698 unsigned long current;
1699 unsigned long *unsentmap = block->unsentmap;
1701 for (current = 0; current < end; ) {
1702 unsigned long one = find_next_bit(unsentmap, end, current);
1704 if (one <= end) {
1705 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1706 unsigned long discard_length;
1708 if (zero >= end) {
1709 discard_length = end - one;
1710 } else {
1711 discard_length = zero - one;
1713 if (discard_length) {
1714 postcopy_discard_send_range(ms, pds, one, discard_length);
1716 current = one + discard_length;
1717 } else {
1718 current = one;
1722 return 0;
1726 * postcopy_each_ram_send_discard: discard all RAMBlocks
1728 * Returns 0 for success or negative for error
1730 * Utility for the outgoing postcopy code.
1731 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1732 * passing it bitmap indexes and name.
1733 * (qemu_ram_foreach_block ends up passing unscaled lengths
1734 * which would mean postcopy code would have to deal with target page)
1736 * @ms: current migration state
1738 static int postcopy_each_ram_send_discard(MigrationState *ms)
1740 struct RAMBlock *block;
1741 int ret;
1743 RAMBLOCK_FOREACH(block) {
1744 PostcopyDiscardState *pds =
1745 postcopy_discard_send_init(ms, block->idstr);
1748 * Postcopy sends chunks of bitmap over the wire, but it
1749 * just needs indexes at this point, avoids it having
1750 * target page specific code.
1752 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1753 postcopy_discard_send_finish(ms, pds);
1754 if (ret) {
1755 return ret;
1759 return 0;
1763 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1765 * Helper for postcopy_chunk_hostpages; it's called twice to
1766 * canonicalize the two bitmaps, that are similar, but one is
1767 * inverted.
1769 * Postcopy requires that all target pages in a hostpage are dirty or
1770 * clean, not a mix. This function canonicalizes the bitmaps.
1772 * @ms: current migration state
1773 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1774 * otherwise we need to canonicalize partially dirty host pages
1775 * @block: block that contains the page we want to canonicalize
1776 * @pds: state for postcopy
1778 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1779 RAMBlock *block,
1780 PostcopyDiscardState *pds)
1782 RAMState *rs = ram_state;
1783 unsigned long *bitmap = block->bmap;
1784 unsigned long *unsentmap = block->unsentmap;
1785 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1786 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1787 unsigned long run_start;
1789 if (block->page_size == TARGET_PAGE_SIZE) {
1790 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1791 return;
1794 if (unsent_pass) {
1795 /* Find a sent page */
1796 run_start = find_next_zero_bit(unsentmap, pages, 0);
1797 } else {
1798 /* Find a dirty page */
1799 run_start = find_next_bit(bitmap, pages, 0);
1802 while (run_start < pages) {
1803 bool do_fixup = false;
1804 unsigned long fixup_start_addr;
1805 unsigned long host_offset;
1808 * If the start of this run of pages is in the middle of a host
1809 * page, then we need to fixup this host page.
1811 host_offset = run_start % host_ratio;
1812 if (host_offset) {
1813 do_fixup = true;
1814 run_start -= host_offset;
1815 fixup_start_addr = run_start;
1816 /* For the next pass */
1817 run_start = run_start + host_ratio;
1818 } else {
1819 /* Find the end of this run */
1820 unsigned long run_end;
1821 if (unsent_pass) {
1822 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1823 } else {
1824 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1827 * If the end isn't at the start of a host page, then the
1828 * run doesn't finish at the end of a host page
1829 * and we need to discard.
1831 host_offset = run_end % host_ratio;
1832 if (host_offset) {
1833 do_fixup = true;
1834 fixup_start_addr = run_end - host_offset;
1836 * This host page has gone, the next loop iteration starts
1837 * from after the fixup
1839 run_start = fixup_start_addr + host_ratio;
1840 } else {
1842 * No discards on this iteration, next loop starts from
1843 * next sent/dirty page
1845 run_start = run_end + 1;
1849 if (do_fixup) {
1850 unsigned long page;
1852 /* Tell the destination to discard this page */
1853 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1854 /* For the unsent_pass we:
1855 * discard partially sent pages
1856 * For the !unsent_pass (dirty) we:
1857 * discard partially dirty pages that were sent
1858 * (any partially sent pages were already discarded
1859 * by the previous unsent_pass)
1861 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1862 host_ratio);
1865 /* Clean up the bitmap */
1866 for (page = fixup_start_addr;
1867 page < fixup_start_addr + host_ratio; page++) {
1868 /* All pages in this host page are now not sent */
1869 set_bit(page, unsentmap);
1872 * Remark them as dirty, updating the count for any pages
1873 * that weren't previously dirty.
1875 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1879 if (unsent_pass) {
1880 /* Find the next sent page for the next iteration */
1881 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1882 } else {
1883 /* Find the next dirty page for the next iteration */
1884 run_start = find_next_bit(bitmap, pages, run_start);
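/*
 * Worked example (page size assumptions hypothetical): with 2 MiB host
 * pages and 4 KiB target pages, host_ratio is 512. If a dirty run starts
 * at target page 1000, host_offset is 1000 % 512 = 488, so the run is
 * rewound to page 512 and the whole host page covering target pages
 * 512..1023 is discarded and re-marked dirty/unsent; the next iteration
 * resumes the search from page 1024.
 */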
1890 * postcopy_chunk_hostpages: discard any partially sent host page
1892 * Utility for the outgoing postcopy code.
1894 * Discard any partially sent host-page size chunks, mark any partially
1895 * dirty host-page size chunks as all dirty. In this case the host-page
1896 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1898 * Returns zero on success
1900 * @ms: current migration state
1901 * @block: block we want to work with
1903 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1905 PostcopyDiscardState *pds =
1906 postcopy_discard_send_init(ms, block->idstr);
1908 /* First pass: Discard all partially sent host pages */
1909 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1911 * Second pass: Ensure that all partially dirty host pages are made
1912 * fully dirty.
1914 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1916 postcopy_discard_send_finish(ms, pds);
1917 return 0;
1921 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1923 * Returns zero on success
1925 * Transmit the set of pages to be discarded after precopy to the target;
1926 * these are pages that:
1927 * a) Have been previously transmitted but are now dirty again
1928 * b) Pages that have never been transmitted, this ensures that
1929 * any pages on the destination that have been mapped by background
1930 * tasks get discarded (transparent huge pages is the specific concern)
1931 * Hopefully this is pretty sparse
1933 * @ms: current migration state
1935 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1937 RAMState *rs = ram_state;
1938 RAMBlock *block;
1939 int ret;
1941 rcu_read_lock();
1943 /* This should be our last sync, the src is now paused */
1944 migration_bitmap_sync(rs);
1946 /* Easiest way to make sure we don't resume in the middle of a host-page */
1947 rs->last_seen_block = NULL;
1948 rs->last_sent_block = NULL;
1949 rs->last_page = 0;
1951 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1952 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1953 unsigned long *bitmap = block->bmap;
1954 unsigned long *unsentmap = block->unsentmap;
1956 if (!unsentmap) {
1957 /* We don't have a safe way to resize the sentmap, so
1958 * if the bitmap was resized it will be NULL at this
1959 * point.
1961 error_report("migration ram resized during precopy phase");
1962 rcu_read_unlock();
1963 return -EINVAL;
1965 /* Deal with TPS != HPS and huge pages */
1966 ret = postcopy_chunk_hostpages(ms, block);
1967 if (ret) {
1968 rcu_read_unlock();
1969 return ret;
1973 * Update the unsentmap to be unsentmap = unsentmap | dirty
1975 bitmap_or(unsentmap, unsentmap, bitmap, pages);
1976 #ifdef DEBUG_POSTCOPY
1977 ram_debug_dump_bitmap(unsentmap, true, pages);
1978 #endif
1980 trace_ram_postcopy_send_discard_bitmap();
1982 ret = postcopy_each_ram_send_discard(ms);
1983 rcu_read_unlock();
1985 return ret;
1989 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1991 * Returns zero on success
1993 * @rbname: name of the RAMBlock of the request. NULL means the
1994 * same as the last one.
1995 * @start: start offset (in bytes) within the RAMBlock
1996 * @length: length (in bytes) to discard
1998 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2000 int ret = -1;
2002 trace_ram_discard_range(rbname, start, length);
2004 rcu_read_lock();
2005 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2007 if (!rb) {
2008 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2009 goto err;
2012 ret = ram_block_discard_range(rb, start, length);
2014 err:
2015 rcu_read_unlock();
2017 return ret;
2020 static int ram_state_init(RAMState **rsp)
2022 *rsp = g_new0(RAMState, 1);
2023 Error *local_err = NULL;
2025 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2026 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2027 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2029 if (migrate_use_xbzrle()) {
2030 XBZRLE_cache_lock();
2031 XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
2032 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2033 TARGET_PAGE_SIZE, &local_err);
2034 if (!XBZRLE.cache) {
2035 XBZRLE_cache_unlock();
2036 error_report_err(local_err);
2037 g_free(*rsp);
2038 *rsp = NULL;
2039 return -1;
2041 XBZRLE_cache_unlock();
2043 /* We prefer not to abort if there is no memory */
2044 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2045 if (!XBZRLE.encoded_buf) {
2046 error_report("Error allocating encoded_buf");
2047 g_free(*rsp);
2048 *rsp = NULL;
2049 return -1;
2052 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2053 if (!XBZRLE.current_buf) {
2054 error_report("Error allocating current_buf");
2055 g_free(XBZRLE.encoded_buf);
2056 XBZRLE.encoded_buf = NULL;
2057 g_free(*rsp);
2058 *rsp = NULL;
2059 return -1;
2063 /* For memory_global_dirty_log_start below. */
2064 qemu_mutex_lock_iothread();
2066 qemu_mutex_lock_ramlist();
2067 rcu_read_lock();
2068 ram_state_reset(*rsp);
2070 /* Skip setting bitmap if there is no RAM */
2071 if (ram_bytes_total()) {
2072 RAMBlock *block;
2074 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2075 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
2077 block->bmap = bitmap_new(pages);
2078 bitmap_set(block->bmap, 0, pages);
2079 if (migrate_postcopy_ram()) {
2080 block->unsentmap = bitmap_new(pages);
2081 bitmap_set(block->unsentmap, 0, pages);
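/*
 * Both bitmaps start fully set: every page is initially considered
 * dirty (and unsent), so the first pass over the RAM blocks transmits
 * all of guest memory at least once.
 */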
2087 * Count the total number of pages used by ram blocks not including any
2088 * gaps due to alignment or unplugs.
2090 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2092 memory_global_dirty_log_start();
2093 migration_bitmap_sync(*rsp);
2094 qemu_mutex_unlock_ramlist();
2095 qemu_mutex_unlock_iothread();
2096 rcu_read_unlock();
2098 return 0;
2102 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2103 * a long-running RCU critical section. When RCU reclaims in the code
2104 * start to become numerous it will be necessary to reduce the
2105 * granularity of these critical sections.
2109 * ram_save_setup: Setup RAM for migration
2111 * Returns zero to indicate success and negative for error
2113 * @f: QEMUFile where to send the data
2114 * @opaque: RAMState pointer
2116 static int ram_save_setup(QEMUFile *f, void *opaque)
2118 RAMState **rsp = opaque;
2119 RAMBlock *block;
2121 /* migration has already set up the bitmap, reuse it. */
2122 if (!migration_in_colo_state()) {
2123 if (ram_state_init(rsp) != 0) {
2124 return -1;
2127 (*rsp)->f = f;
2129 rcu_read_lock();
2131 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
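/*
 * ram_bytes_total() is expected to be target-page aligned, so the low
 * bits are free to carry RAM_SAVE_FLAG_MEM_SIZE; ram_load() masks the
 * flag off again before using the value as total_ram_bytes.
 */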
2133 RAMBLOCK_FOREACH(block) {
2134 qemu_put_byte(f, strlen(block->idstr));
2135 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2136 qemu_put_be64(f, block->used_length);
2137 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2138 qemu_put_be64(f, block->page_size);
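/*
 * The per-block page size only goes on the wire when postcopy is
 * possible and the block's page size differs from the host page size;
 * ram_load() reads it back under the same condition.
 */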
2142 rcu_read_unlock();
2143 compress_threads_save_setup();
2145 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2146 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2148 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2150 return 0;
2154 * ram_save_iterate: iterative stage for migration
2156 * Returns zero to indicate success and negative for error
2158 * @f: QEMUFile where to send the data
2159 * @opaque: RAMState pointer
2161 static int ram_save_iterate(QEMUFile *f, void *opaque)
2163 RAMState **temp = opaque;
2164 RAMState *rs = *temp;
2165 int ret;
2166 int i;
2167 int64_t t0;
2168 int done = 0;
2170 rcu_read_lock();
2171 if (ram_list.version != rs->last_version) {
2172 ram_state_reset(rs);
2175 /* Read version before ram_list.blocks */
2176 smp_rmb();
2178 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2180 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2181 i = 0;
2182 while ((ret = qemu_file_rate_limit(f)) == 0) {
2183 int pages;
2185 pages = ram_find_and_save_block(rs, false);
2186 /* no more pages to send */
2187 if (pages == 0) {
2188 done = 1;
2189 break;
2191 rs->iterations++;
2193 /* we want to check in the 1st loop, just in case it was the 1st time
2194 and we had to sync the dirty bitmap.
2195 qemu_clock_get_ns() is a bit expensive, so we only check it every
2196 few iterations
2198 if ((i & 63) == 0) {
2199 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2200 if (t1 > MAX_WAIT) {
2201 trace_ram_save_iterate_big_wait(t1, i);
2202 break;
2205 i++;
2207 flush_compressed_data(rs);
2208 rcu_read_unlock();
2211 * Must occur before EOS (or any QEMUFile operation)
2212 * because of the RDMA protocol.
2214 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2216 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2217 ram_counters.transferred += 8;
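/* account for the 8-byte EOS marker written just above */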
2219 ret = qemu_file_get_error(f);
2220 if (ret < 0) {
2221 return ret;
2224 return done;
2228 * ram_save_complete: function called to send the remaining amount of ram
2230 * Returns zero to indicate success
2232 * Called with the iothread lock held
2234 * @f: QEMUFile where to send the data
2235 * @opaque: RAMState pointer
2237 static int ram_save_complete(QEMUFile *f, void *opaque)
2239 RAMState **temp = opaque;
2240 RAMState *rs = *temp;
2242 rcu_read_lock();
2244 if (!migration_in_postcopy()) {
2245 migration_bitmap_sync(rs);
2248 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2250 /* try transferring iterative blocks of memory */
2252 /* flush all remaining blocks regardless of rate limiting */
2253 while (true) {
2254 int pages;
2256 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2258 /* no more blocks to send */
2258 if (pages == 0) {
2259 break;
2263 flush_compressed_data(rs);
2264 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2266 rcu_read_unlock();
2268 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2270 return 0;
2273 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2274 uint64_t *non_postcopiable_pending,
2275 uint64_t *postcopiable_pending)
2277 RAMState **temp = opaque;
2278 RAMState *rs = *temp;
2279 uint64_t remaining_size;
2281 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2283 if (!migration_in_postcopy() &&
2284 remaining_size < max_size) {
2285 qemu_mutex_lock_iothread();
2286 rcu_read_lock();
2287 migration_bitmap_sync(rs);
2288 rcu_read_unlock();
2289 qemu_mutex_unlock_iothread();
2290 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
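/*
 * Only when the stale estimate already looks small enough do we pay for
 * a fresh bitmap sync (under the iothread lock), so the decision to
 * move on to the completion stage is based on up-to-date numbers.
 */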
2293 if (migrate_postcopy_ram()) {
2294 /* We can do postcopy, and all the data is postcopiable */
2295 *postcopiable_pending += remaining_size;
2296 } else {
2297 *non_postcopiable_pending += remaining_size;
2301 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2303 unsigned int xh_len;
2304 int xh_flags;
2305 uint8_t *loaded_data;
2307 /* extract RLE header */
2308 xh_flags = qemu_get_byte(f);
2309 xh_len = qemu_get_be16(f);
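/*
 * Wire format of an XBZRLE page: a 1-byte flags field
 * (ENCODING_FLAG_XBZRLE), a 16-bit big-endian length, and then xh_len
 * bytes of encoded data.
 */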
2311 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2312 error_report("Failed to load XBZRLE page - wrong compression!");
2313 return -1;
2316 if (xh_len > TARGET_PAGE_SIZE) {
2317 error_report("Failed to load XBZRLE page - len overflow!");
2318 return -1;
2320 loaded_data = XBZRLE.decoded_buf;
2321 /* load data and decode */
2322 /* it can change loaded_data to point to an internal buffer */
2323 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2325 /* decode RLE */
2326 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2327 TARGET_PAGE_SIZE) == -1) {
2328 error_report("Failed to load XBZRLE page - decode error!");
2329 return -1;
2332 return 0;
2336 * ram_block_from_stream: read a RAMBlock id from the migration stream
2338 * Must be called from within a rcu critical section.
2340 * Returns a pointer from within the RCU-protected ram_list.
2342 * @f: QEMUFile where to read the data from
2343 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2345 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2347 static RAMBlock *block = NULL;
2348 char id[256];
2349 uint8_t len;
2351 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2352 if (!block) {
2353 error_report("Ack, bad migration stream!");
2354 return NULL;
2356 return block;
2359 len = qemu_get_byte(f);
2360 qemu_get_buffer(f, (uint8_t *)id, len);
2361 id[len] = 0;
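/* block ids arrive as a 1-byte length plus an unterminated name */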
2363 block = qemu_ram_block_by_name(id);
2364 if (!block) {
2365 error_report("Can't find block %s", id);
2366 return NULL;
2369 return block;
2372 static inline void *host_from_ram_block_offset(RAMBlock *block,
2373 ram_addr_t offset)
2375 if (!offset_in_ramblock(block, offset)) {
2376 return NULL;
2379 return block->host + offset;
2383 * ram_handle_compressed: handle the zero page case
2385 * If a page (or a whole RDMA chunk) has been
2386 * determined to be zero, then zap it.
2388 * @host: host address for the zero page
2389 * @ch: the byte the page is filled with; only zero is supported
2390 * @size: size of the zero page
2392 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2394 if (ch != 0 || !is_zero_range(host, size)) {
2395 memset(host, ch, size);
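/*
 * The is_zero_range() check above lets us skip the memset when the page
 * is already zero, so untouched destination pages are not needlessly
 * dirtied.
 */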
2399 static void *do_data_decompress(void *opaque)
2401 DecompressParam *param = opaque;
2402 unsigned long pagesize;
2403 uint8_t *des;
2404 int len;
2406 qemu_mutex_lock(&param->mutex);
2407 while (!param->quit) {
2408 if (param->des) {
2409 des = param->des;
2410 len = param->len;
2411 param->des = 0;
2412 qemu_mutex_unlock(&param->mutex);
2414 pagesize = TARGET_PAGE_SIZE;
2415 /* uncompress() may fail in some cases, especially when the
2416 * page was dirtied while it was being compressed; that's not
2417 * a problem because the dirty page will be retransmitted and
2418 * uncompress() won't corrupt the data in other pages.
2420 uncompress((Bytef *)des, &pagesize,
2421 (const Bytef *)param->compbuf, len);
2423 qemu_mutex_lock(&decomp_done_lock);
2424 param->done = true;
2425 qemu_cond_signal(&decomp_done_cond);
2426 qemu_mutex_unlock(&decomp_done_lock);
2428 qemu_mutex_lock(&param->mutex);
2429 } else {
2430 qemu_cond_wait(&param->cond, &param->mutex);
2433 qemu_mutex_unlock(&param->mutex);
2435 return NULL;
2438 static void wait_for_decompress_done(void)
2440 int idx, thread_count;
2442 if (!migrate_use_compression()) {
2443 return;
2446 thread_count = migrate_decompress_threads();
2447 qemu_mutex_lock(&decomp_done_lock);
2448 for (idx = 0; idx < thread_count; idx++) {
2449 while (!decomp_param[idx].done) {
2450 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2453 qemu_mutex_unlock(&decomp_done_lock);
2456 static void compress_threads_load_setup(void)
2458 int i, thread_count;
2460 if (!migrate_use_compression()) {
2461 return;
2463 thread_count = migrate_decompress_threads();
2464 decompress_threads = g_new0(QemuThread, thread_count);
2465 decomp_param = g_new0(DecompressParam, thread_count);
2466 qemu_mutex_init(&decomp_done_lock);
2467 qemu_cond_init(&decomp_done_cond);
2468 for (i = 0; i < thread_count; i++) {
2469 qemu_mutex_init(&decomp_param[i].mutex);
2470 qemu_cond_init(&decomp_param[i].cond);
2471 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
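/*
 * compbuf must be able to hold the largest possible compressed form of
 * a single target page, hence compressBound(TARGET_PAGE_SIZE).
 */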
2472 decomp_param[i].done = true;
2473 decomp_param[i].quit = false;
2474 qemu_thread_create(decompress_threads + i, "decompress",
2475 do_data_decompress, decomp_param + i,
2476 QEMU_THREAD_JOINABLE);
2480 static void compress_threads_load_cleanup(void)
2482 int i, thread_count;
2484 if (!migrate_use_compression()) {
2485 return;
2487 thread_count = migrate_decompress_threads();
2488 for (i = 0; i < thread_count; i++) {
2489 qemu_mutex_lock(&decomp_param[i].mutex);
2490 decomp_param[i].quit = true;
2491 qemu_cond_signal(&decomp_param[i].cond);
2492 qemu_mutex_unlock(&decomp_param[i].mutex);
2494 for (i = 0; i < thread_count; i++) {
2495 qemu_thread_join(decompress_threads + i);
2496 qemu_mutex_destroy(&decomp_param[i].mutex);
2497 qemu_cond_destroy(&decomp_param[i].cond);
2498 g_free(decomp_param[i].compbuf);
2500 g_free(decompress_threads);
2501 g_free(decomp_param);
2502 decompress_threads = NULL;
2503 decomp_param = NULL;
2506 static void decompress_data_with_multi_threads(QEMUFile *f,
2507 void *host, int len)
2509 int idx, thread_count;
2511 thread_count = migrate_decompress_threads();
2512 qemu_mutex_lock(&decomp_done_lock);
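/*
 * Hand the compressed page to the first idle decompression thread; if
 * none is idle, wait on decomp_done_cond until one finishes.
 */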
2513 while (true) {
2514 for (idx = 0; idx < thread_count; idx++) {
2515 if (decomp_param[idx].done) {
2516 decomp_param[idx].done = false;
2517 qemu_mutex_lock(&decomp_param[idx].mutex);
2518 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2519 decomp_param[idx].des = host;
2520 decomp_param[idx].len = len;
2521 qemu_cond_signal(&decomp_param[idx].cond);
2522 qemu_mutex_unlock(&decomp_param[idx].mutex);
2523 break;
2526 if (idx < thread_count) {
2527 break;
2528 } else {
2529 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2532 qemu_mutex_unlock(&decomp_done_lock);
2536 * ram_load_setup: Setup RAM for migration incoming side
2538 * Returns zero to indicate success and negative for error
2540 * @f: QEMUFile where to receive the data
2541 * @opaque: RAMState pointer
2543 static int ram_load_setup(QEMUFile *f, void *opaque)
2545 xbzrle_load_setup();
2546 compress_threads_load_setup();
2547 return 0;
2550 static int ram_load_cleanup(void *opaque)
2552 xbzrle_load_cleanup();
2553 compress_threads_load_cleanup();
2554 return 0;
2558 * ram_postcopy_incoming_init: allocate postcopy data structures
2560 * Returns 0 for success and negative on error
2562 * @mis: current migration incoming state
2564 * Allocate data structures etc needed by incoming migration with
2565 * postcopy-ram. postcopy-ram's similarly named
2566 * postcopy_ram_incoming_init does the work.
2568 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2570 unsigned long ram_pages = last_ram_page();
2572 return postcopy_ram_incoming_init(mis, ram_pages);
2576 * ram_load_postcopy: load a page in postcopy case
2578 * Returns 0 for success or -errno in case of error
2580 * Called in postcopy mode by ram_load().
2581 * rcu_read_lock is taken prior to this being called.
2583 * @f: QEMUFile to receive the data from
2585 static int ram_load_postcopy(QEMUFile *f)
2587 int flags = 0, ret = 0;
2588 bool place_needed = false;
2589 bool matching_page_sizes = false;
2590 MigrationIncomingState *mis = migration_incoming_get_current();
2591 /* Temporary page that is later 'placed' */
2592 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2593 void *last_host = NULL;
2594 bool all_zero = false;
2596 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2597 ram_addr_t addr;
2598 void *host = NULL;
2599 void *page_buffer = NULL;
2600 void *place_source = NULL;
2601 RAMBlock *block = NULL;
2602 uint8_t ch;
2604 addr = qemu_get_be64(f);
2605 flags = addr & ~TARGET_PAGE_MASK;
2606 addr &= TARGET_PAGE_MASK;
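/*
 * Each record starts with a 64-bit word: the page-aligned part is the
 * offset within the RAMBlock, and the low bits carry the
 * RAM_SAVE_FLAG_* bits for this page.
 */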
2608 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2609 place_needed = false;
2610 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2611 block = ram_block_from_stream(f, flags);
2613 host = host_from_ram_block_offset(block, addr);
2614 if (!host) {
2615 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2616 ret = -EINVAL;
2617 break;
2619 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2621 * Postcopy requires that we place whole host pages atomically;
2622 * these may be huge pages for RAMBlocks that are backed by
2623 * hugetlbfs.
2624 * To make it atomic, the data is read into a temporary page
2625 * that's moved into place later.
2626 * The migration protocol uses, possibly smaller, target pages;
2627 * however, the source ensures it always sends all the components
2628 * of a host page in order.
2630 page_buffer = postcopy_host_page +
2631 ((uintptr_t)host & (block->page_size - 1));
2632 /* If all target pages are zero then we can optimise the place */
2633 if (!((uintptr_t)host & (block->page_size - 1))) {
2634 all_zero = true;
2635 } else {
2636 /* not the 1st target page within the host page */
2637 if (host != (last_host + TARGET_PAGE_SIZE)) {
2638 error_report("Non-sequential target page %p/%p",
2639 host, last_host);
2640 ret = -EINVAL;
2641 break;
2647 * If it's the last part of a host page then we place the host
2648 * page
2650 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2651 (block->page_size - 1)) == 0;
2652 place_source = postcopy_host_page;
2654 last_host = host;
2656 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2657 case RAM_SAVE_FLAG_ZERO:
2658 ch = qemu_get_byte(f);
2659 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2660 if (ch) {
2661 all_zero = false;
2663 break;
2665 case RAM_SAVE_FLAG_PAGE:
2666 all_zero = false;
2667 if (!place_needed || !matching_page_sizes) {
2668 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2669 } else {
2670 /* Avoid copying out of the QEMUFile buffer here, since postcopy
2671 * will copy the data again when placing the page; this only works
2672 * when the whole page is read in one go (matching page sizes)
2674 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2675 TARGET_PAGE_SIZE);
2677 break;
2678 case RAM_SAVE_FLAG_EOS:
2679 /* normal exit */
2680 break;
2681 default:
2682 error_report("Unknown combination of migration flags: %#x"
2683 " (postcopy mode)", flags);
2684 ret = -EINVAL;
2687 if (place_needed) {
2688 /* This gets called at the last target page in the host page */
2689 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
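/*
 * host points at the last target page of the host page here, so step
 * back to the start of the host page before placing it.
 */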
2691 if (all_zero) {
2692 ret = postcopy_place_page_zero(mis, place_dest,
2693 block->page_size);
2694 } else {
2695 ret = postcopy_place_page(mis, place_dest,
2696 place_source, block->page_size);
2699 if (!ret) {
2700 ret = qemu_file_get_error(f);
2704 return ret;
2707 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2709 int flags = 0, ret = 0, invalid_flags = 0;
2710 static uint64_t seq_iter;
2711 int len = 0;
2713 * If system is running in postcopy mode, page inserts to host memory must
2714 * be atomic
2716 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2717 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
2718 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2720 seq_iter++;
2722 if (version_id != 4) {
2723 ret = -EINVAL;
2726 if (!migrate_use_compression()) {
2727 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2729 /* This RCU critical section can be very long running.
2730 * When RCU reclaims in the code start to become numerous,
2731 * it will be necessary to reduce the granularity of this
2732 * critical section.
2734 rcu_read_lock();
2736 if (postcopy_running) {
2737 ret = ram_load_postcopy(f);
2740 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2741 ram_addr_t addr, total_ram_bytes;
2742 void *host = NULL;
2743 uint8_t ch;
2745 addr = qemu_get_be64(f);
2746 flags = addr & ~TARGET_PAGE_MASK;
2747 addr &= TARGET_PAGE_MASK;
2749 if (flags & invalid_flags) {
2750 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2751 error_report("Received an unexpected compressed page");
2754 ret = -EINVAL;
2755 break;
2758 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2759 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2760 RAMBlock *block = ram_block_from_stream(f, flags);
2762 host = host_from_ram_block_offset(block, addr);
2763 if (!host) {
2764 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2765 ret = -EINVAL;
2766 break;
2768 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2771 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2772 case RAM_SAVE_FLAG_MEM_SIZE:
2773 /* Synchronize RAM block list */
2774 total_ram_bytes = addr;
2775 while (!ret && total_ram_bytes) {
2776 RAMBlock *block;
2777 char id[256];
2778 ram_addr_t length;
2780 len = qemu_get_byte(f);
2781 qemu_get_buffer(f, (uint8_t *)id, len);
2782 id[len] = 0;
2783 length = qemu_get_be64(f);
2785 block = qemu_ram_block_by_name(id);
2786 if (block) {
2787 if (length != block->used_length) {
2788 Error *local_err = NULL;
2790 ret = qemu_ram_resize(block, length,
2791 &local_err);
2792 if (local_err) {
2793 error_report_err(local_err);
2796 /* For postcopy we need to check hugepage sizes match */
2797 if (postcopy_advised &&
2798 block->page_size != qemu_host_page_size) {
2799 uint64_t remote_page_size = qemu_get_be64(f);
2800 if (remote_page_size != block->page_size) {
2801 error_report("Mismatched RAM page size %s "
2802 "(local) %zd != %" PRId64,
2803 id, block->page_size,
2804 remote_page_size);
2805 ret = -EINVAL;
2808 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2809 block->idstr);
2810 } else {
2811 error_report("Unknown ramblock \"%s\", cannot "
2812 "accept migration", id);
2813 ret = -EINVAL;
2816 total_ram_bytes -= length;
2818 break;
2820 case RAM_SAVE_FLAG_ZERO:
2821 ch = qemu_get_byte(f);
2822 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2823 break;
2825 case RAM_SAVE_FLAG_PAGE:
2826 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2827 break;
2829 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2830 len = qemu_get_be32(f);
2831 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2832 error_report("Invalid compressed data length: %d", len);
2833 ret = -EINVAL;
2834 break;
2836 decompress_data_with_multi_threads(f, host, len);
2837 break;
2839 case RAM_SAVE_FLAG_XBZRLE:
2840 if (load_xbzrle(f, addr, host) < 0) {
2841 error_report("Failed to decompress XBZRLE page at "
2842 RAM_ADDR_FMT, addr);
2843 ret = -EINVAL;
2844 break;
2846 break;
2847 case RAM_SAVE_FLAG_EOS:
2848 /* normal exit */
2849 break;
2850 default:
2851 if (flags & RAM_SAVE_FLAG_HOOK) {
2852 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2853 } else {
2854 error_report("Unknown combination of migration flags: %#x",
2855 flags);
2856 ret = -EINVAL;
2859 if (!ret) {
2860 ret = qemu_file_get_error(f);
2864 wait_for_decompress_done();
2865 rcu_read_unlock();
2866 trace_ram_load_complete(ret, seq_iter);
2867 return ret;
2870 static bool ram_has_postcopy(void *opaque)
2872 return migrate_postcopy_ram();
2875 static SaveVMHandlers savevm_ram_handlers = {
2876 .save_setup = ram_save_setup,
2877 .save_live_iterate = ram_save_iterate,
2878 .save_live_complete_postcopy = ram_save_complete,
2879 .save_live_complete_precopy = ram_save_complete,
2880 .has_postcopy = ram_has_postcopy,
2881 .save_live_pending = ram_save_pending,
2882 .load_state = ram_load,
2883 .save_cleanup = ram_save_cleanup,
2884 .load_setup = ram_load_setup,
2885 .load_cleanup = ram_load_cleanup,
2888 void ram_mig_init(void)
2890 qemu_mutex_init(&XBZRLE.lock);
2891 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
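/* the section version (4) must match the version_id check in ram_load() */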