/*
 * qemu/kvm integration
 *
 * Copyright (C) 2006-2008 Qumranet Technologies
 *
 * Licensed under the terms of the GNU GPL version 2 or higher.
 */
8 #include "config.h"
9 #include "config-host.h"
11 #include <assert.h>
12 #include <string.h>
13 #include "hw/hw.h"
14 #include "sysemu.h"
15 #include "qemu-common.h"
16 #include "console.h"
17 #include "block.h"
18 #include "compatfd.h"
19 #include "gdbstub.h"
21 #include "qemu-kvm.h"
22 #include "libkvm.h"
24 #include <pthread.h>
25 #include <sys/utsname.h>
26 #include <sys/syscall.h>
27 #include <sys/mman.h>
28 #include <sys/ioctl.h>
29 #include "compatfd.h"
30 #include <sys/prctl.h>
32 #define false 0
33 #define true 1
35 #ifndef PR_MCE_KILL
36 #define PR_MCE_KILL 33
37 #endif
39 #ifndef BUS_MCEERR_AR
40 #define BUS_MCEERR_AR 4
41 #endif
42 #ifndef BUS_MCEERR_AO
43 #define BUS_MCEERR_AO 5
44 #endif
46 #define EXPECTED_KVM_API_VERSION 12
48 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
49 #error libkvm: userspace and kernel version mismatch
50 #endif
52 int kvm_allowed = 1;
53 int kvm_irqchip = 1;
54 int kvm_pit = 1;
55 int kvm_pit_reinject = 1;
56 int kvm_nested = 0;
59 KVMState *kvm_state;
60 kvm_context_t kvm_context;
62 pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
63 pthread_cond_t qemu_vcpu_cond = PTHREAD_COND_INITIALIZER;
64 pthread_cond_t qemu_system_cond = PTHREAD_COND_INITIALIZER;
65 pthread_cond_t qemu_pause_cond = PTHREAD_COND_INITIALIZER;
66 pthread_cond_t qemu_work_cond = PTHREAD_COND_INITIALIZER;
67 __thread CPUState *current_env;
69 static int qemu_system_ready;
71 #define SIG_IPI (SIGRTMIN+4)
73 pthread_t io_thread;
74 static int io_thread_fd = -1;
75 static int io_thread_sigfd = -1;
77 static CPUState *kvm_debug_cpu_requested;
79 static uint64_t phys_ram_size;
81 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
82 /* The list of ioperm_data */
83 static QLIST_HEAD(, ioperm_data) ioperm_head;
84 #endif
86 //#define DEBUG_MEMREG
87 #ifdef DEBUG_MEMREG
88 #define DPRINTF(fmt, args...) \
89 do { fprintf(stderr, "%s:%d " fmt , __func__, __LINE__, ##args); } while (0)
90 #else
91 #define DPRINTF(fmt, args...) do {} while (0)
92 #endif
94 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
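/* Rounds x up to the next multiple of y (y must be a power of two),
 * e.g. ALIGN(33, 32) == 64 and ALIGN(64, 32) == 64. */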
96 int kvm_abi = EXPECTED_KVM_API_VERSION;
97 int kvm_page_size;
99 #ifdef KVM_CAP_SET_GUEST_DEBUG
100 static int kvm_debug(CPUState *env,
101 struct kvm_debug_exit_arch *arch_info)
103 int handle = kvm_arch_debug(arch_info);
105 if (handle) {
106 kvm_debug_cpu_requested = env;
107 env->stopped = 1;
109 return handle;
111 #endif
113 static int handle_unhandled(uint64_t reason)
115 fprintf(stderr, "kvm: unhandled exit %" PRIx64 "\n", reason);
116 return -EINVAL;
static inline void set_gsi(kvm_context_t kvm, unsigned int gsi)
{
    uint32_t *bitmap = kvm->used_gsi_bitmap;

    if (gsi < kvm->max_gsi)
        bitmap[gsi / 32] |= 1U << (gsi % 32);
    else
        DPRINTF("Invalid GSI %u\n", gsi);
}
static inline void clear_gsi(kvm_context_t kvm, unsigned int gsi)
{
    uint32_t *bitmap = kvm->used_gsi_bitmap;

    if (gsi < kvm->max_gsi)
        bitmap[gsi / 32] &= ~(1U << (gsi % 32));
    else
        DPRINTF("Invalid GSI %u\n", gsi);
}
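/*
 * used_gsi_bitmap keeps one bit per global system interrupt (GSI): a set
 * bit means the GSI has a routing entry (or is over-allocated padding
 * marked in kvm_init()).  kvm_get_irq_route_gsi() further below hands out
 * the lowest clear bit.
 */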
140 struct slot_info {
141 unsigned long phys_addr;
142 unsigned long len;
143 unsigned long userspace_addr;
144 unsigned flags;
145 int logging_count;
148 struct slot_info slots[KVM_MAX_NUM_MEM_REGIONS];
150 static void init_slots(void)
152 int i;
154 for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i)
155 slots[i].len = 0;
158 static int get_free_slot(kvm_context_t kvm)
160 int i;
161 int tss_ext;
163 #if defined(KVM_CAP_SET_TSS_ADDR) && !defined(__s390__)
164 tss_ext = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_SET_TSS_ADDR);
165 #else
166 tss_ext = 0;
167 #endif
    /*
     * on older kernels where the set tss ioctl is not supported we must save
     * slot 0 to hold the extended memory, as the vmx will use the last 3
     * pages of this slot.
     */
174 if (tss_ext > 0)
175 i = 0;
176 else
177 i = 1;
179 for (; i < KVM_MAX_NUM_MEM_REGIONS; ++i)
180 if (!slots[i].len)
181 return i;
182 return -1;
185 static void register_slot(int slot, unsigned long phys_addr,
186 unsigned long len, unsigned long userspace_addr,
187 unsigned flags)
189 slots[slot].phys_addr = phys_addr;
190 slots[slot].len = len;
191 slots[slot].userspace_addr = userspace_addr;
192 slots[slot].flags = flags;
195 static void free_slot(int slot)
197 slots[slot].len = 0;
198 slots[slot].logging_count = 0;
201 static int get_slot(unsigned long phys_addr)
203 int i;
205 for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) {
206 if (slots[i].len && slots[i].phys_addr <= phys_addr &&
207 (slots[i].phys_addr + slots[i].len - 1) >= phys_addr)
208 return i;
210 return -1;
/* Returns -1 if this slot is not totally contained in any other,
 * and the number of the slot otherwise */
215 static int get_container_slot(uint64_t phys_addr, unsigned long size)
217 int i;
219 for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i)
220 if (slots[i].len && slots[i].phys_addr <= phys_addr &&
221 (slots[i].phys_addr + slots[i].len) >= phys_addr + size)
222 return i;
223 return -1;
226 int kvm_is_containing_region(kvm_context_t kvm, unsigned long phys_addr,
227 unsigned long size)
229 int slot = get_container_slot(phys_addr, size);
230 if (slot == -1)
231 return 0;
232 return 1;
/*
 * dirty pages logging control
 */
238 static int kvm_dirty_pages_log_change(kvm_context_t kvm,
239 unsigned long phys_addr, unsigned flags,
240 unsigned mask)
242 int r = -1;
243 int slot = get_slot(phys_addr);
245 if (slot == -1) {
246 fprintf(stderr, "BUG: %s: invalid parameters\n", __FUNCTION__);
247 return 1;
250 flags = (slots[slot].flags & ~mask) | flags;
251 if (flags == slots[slot].flags)
252 return 0;
253 slots[slot].flags = flags;
256 struct kvm_userspace_memory_region mem = {
257 .slot = slot,
258 .memory_size = slots[slot].len,
259 .guest_phys_addr = slots[slot].phys_addr,
260 .userspace_addr = slots[slot].userspace_addr,
261 .flags = slots[slot].flags,
265 DPRINTF("slot %d start %llx len %llx flags %x\n",
266 mem.slot, mem.guest_phys_addr, mem.memory_size, mem.flags);
267 r = kvm_vm_ioctl(kvm_state, KVM_SET_USER_MEMORY_REGION, &mem);
268 if (r < 0)
269 fprintf(stderr, "%s: %m\n", __FUNCTION__);
271 return r;
274 static int kvm_dirty_pages_log_change_all(kvm_context_t kvm,
275 int (*change)(kvm_context_t kvm,
276 uint64_t start,
277 uint64_t len))
279 int i, r;
281 for (i = r = 0; i < KVM_MAX_NUM_MEM_REGIONS && r == 0; i++) {
282 if (slots[i].len)
283 r = change(kvm, slots[i].phys_addr, slots[i].len);
285 return r;
288 int kvm_dirty_pages_log_enable_slot(kvm_context_t kvm, uint64_t phys_addr,
289 uint64_t len)
291 int slot = get_slot(phys_addr);
293 DPRINTF("start %" PRIx64 " len %" PRIx64 "\n", phys_addr, len);
294 if (slot == -1) {
295 fprintf(stderr, "BUG: %s: invalid parameters\n", __func__);
296 return -EINVAL;
299 if (slots[slot].logging_count++)
300 return 0;
302 return kvm_dirty_pages_log_change(kvm, slots[slot].phys_addr,
303 KVM_MEM_LOG_DIRTY_PAGES,
304 KVM_MEM_LOG_DIRTY_PAGES);
307 int kvm_dirty_pages_log_disable_slot(kvm_context_t kvm, uint64_t phys_addr,
308 uint64_t len)
310 int slot = get_slot(phys_addr);
312 if (slot == -1) {
313 fprintf(stderr, "BUG: %s: invalid parameters\n", __func__);
314 return -EINVAL;
317 if (--slots[slot].logging_count)
318 return 0;
320 return kvm_dirty_pages_log_change(kvm, slots[slot].phys_addr, 0,
321 KVM_MEM_LOG_DIRTY_PAGES);
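/*
 * logging_count reference-counts enable/disable requests per slot: only the
 * first enable and the matching last disable actually reach the kernel
 * through kvm_dirty_pages_log_change(); intermediate calls are no-ops.
 */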
/*
 * Enable dirty page logging for all memory regions
 */
327 int kvm_dirty_pages_log_enable_all(kvm_context_t kvm)
329 if (kvm->dirty_pages_log_all)
330 return 0;
331 kvm->dirty_pages_log_all = 1;
332 return kvm_dirty_pages_log_change_all(kvm, kvm_dirty_pages_log_enable_slot);
/*
 * Enable dirty page logging only for memory regions that were created with
 * dirty logging enabled (disable for all other memory regions).
 */
339 int kvm_dirty_pages_log_reset(kvm_context_t kvm)
341 if (!kvm->dirty_pages_log_all)
342 return 0;
343 kvm->dirty_pages_log_all = 0;
344 return kvm_dirty_pages_log_change_all(kvm,
345 kvm_dirty_pages_log_disable_slot);
349 static int kvm_create_context(void);
351 int kvm_init(int smp_cpus)
353 int fd;
354 int r, gsi_count;
357 fd = open("/dev/kvm", O_RDWR);
358 if (fd == -1) {
359 perror("open /dev/kvm");
360 return -1;
362 r = ioctl(fd, KVM_GET_API_VERSION, 0);
363 if (r == -1) {
364 fprintf(stderr,
365 "kvm kernel version too old: "
366 "KVM_GET_API_VERSION ioctl not supported\n");
367 goto out_close;
369 if (r < EXPECTED_KVM_API_VERSION) {
370 fprintf(stderr, "kvm kernel version too old: "
371 "We expect API version %d or newer, but got "
372 "version %d\n", EXPECTED_KVM_API_VERSION, r);
373 goto out_close;
375 if (r > EXPECTED_KVM_API_VERSION) {
376 fprintf(stderr, "kvm userspace version too old\n");
377 goto out_close;
379 kvm_abi = r;
380 kvm_page_size = getpagesize();
381 kvm_state = qemu_mallocz(sizeof(*kvm_state));
382 kvm_context = &kvm_state->kvm_context;
384 kvm_state->fd = fd;
385 kvm_state->vmfd = -1;
386 kvm_context->opaque = cpu_single_env;
387 kvm_context->dirty_pages_log_all = 0;
388 kvm_context->no_irqchip_creation = 0;
389 kvm_context->no_pit_creation = 0;
391 #ifdef KVM_CAP_SET_GUEST_DEBUG
392 QTAILQ_INIT(&kvm_state->kvm_sw_breakpoints);
393 #endif
395 gsi_count = kvm_get_gsi_count(kvm_context);
396 if (gsi_count > 0) {
397 int gsi_bits, i;
399 /* Round up so we can search ints using ffs */
400 gsi_bits = ALIGN(gsi_count, 32);
401 kvm_context->used_gsi_bitmap = qemu_mallocz(gsi_bits / 8);
402 kvm_context->max_gsi = gsi_bits;
404 /* Mark any over-allocated bits as already in use */
405 for (i = gsi_count; i < gsi_bits; i++)
406 set_gsi(kvm_context, i);
409 pthread_mutex_lock(&qemu_mutex);
410 return kvm_create_context();
412 out_close:
413 close(fd);
414 return -1;
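/*
 * Summary of the bring-up above: kvm_init() opens /dev/kvm, verifies
 * KVM_GET_API_VERSION against EXPECTED_KVM_API_VERSION, allocates kvm_state
 * and the GSI bitmap, and finally calls kvm_create_context() (defined near
 * the end of this file) with qemu_mutex held.
 */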
static void kvm_finalize(KVMState *s)
{
    /* FIXME
       if (kvm->vcpu_fd[0] != -1)
           close(kvm->vcpu_fd[0]);
       if (kvm->vm_fd != -1)
           close(kvm->vm_fd);
     */
    close(s->fd);
    free(s);
}
429 void kvm_disable_irqchip_creation(kvm_context_t kvm)
431 kvm->no_irqchip_creation = 1;
434 void kvm_disable_pit_creation(kvm_context_t kvm)
436 kvm->no_pit_creation = 1;
439 kvm_vcpu_context_t kvm_create_vcpu(CPUState *env, int id)
441 long mmap_size;
442 int r;
443 kvm_vcpu_context_t vcpu_ctx = qemu_malloc(sizeof(struct kvm_vcpu_context));
445 r = kvm_vm_ioctl(kvm_state, KVM_CREATE_VCPU, id);
446 if (r < 0) {
447 fprintf(stderr, "kvm_create_vcpu: %m\n");
448 goto err;
450 vcpu_ctx->fd = r;
452 env->kvm_fd = r;
453 env->kvm_state = kvm_state;
455 mmap_size = kvm_ioctl(kvm_state, KVM_GET_VCPU_MMAP_SIZE, 0);
456 if (mmap_size < 0) {
457 fprintf(stderr, "get vcpu mmap size: %m\n");
458 goto err_fd;
    env->kvm_run =
        mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu_ctx->fd,
             0);
463 if (env->kvm_run == MAP_FAILED) {
464 fprintf(stderr, "mmap vcpu area: %m\n");
465 goto err_fd;
468 return vcpu_ctx;
469 err_fd:
470 close(vcpu_ctx->fd);
471 err:
472 free(vcpu_ctx);
473 return NULL;
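/*
 * Every vcpu fd returned by KVM_CREATE_VCPU exposes a shared "struct
 * kvm_run" region that must be mmap()ed with the size reported by
 * KVM_GET_VCPU_MMAP_SIZE; kvm_run() below reads exit information (I/O,
 * MMIO, ...) from that mapping after each KVM_RUN ioctl.
 */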
476 static int kvm_set_boot_vcpu_id(kvm_context_t kvm, uint32_t id)
478 #ifdef KVM_CAP_SET_BOOT_CPU_ID
479 int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_SET_BOOT_CPU_ID);
480 if (r > 0)
481 return kvm_vm_ioctl(kvm_state, KVM_SET_BOOT_CPU_ID, id);
482 return -ENOSYS;
483 #else
484 return -ENOSYS;
485 #endif
488 int kvm_create_vm(kvm_context_t kvm)
490 int fd;
491 #ifdef KVM_CAP_IRQ_ROUTING
492 kvm->irq_routes = qemu_mallocz(sizeof(*kvm->irq_routes));
493 kvm->nr_allocated_irq_routes = 0;
494 #endif
496 fd = kvm_ioctl(kvm_state, KVM_CREATE_VM, 0);
497 if (fd < 0) {
498 fprintf(stderr, "kvm_create_vm: %m\n");
499 return -1;
501 kvm_state->vmfd = fd;
502 return 0;
505 static int kvm_create_default_phys_mem(kvm_context_t kvm,
506 unsigned long phys_mem_bytes,
507 void **vm_mem)
509 #ifdef KVM_CAP_USER_MEMORY
510 int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY);
511 if (r > 0)
512 return 0;
513 fprintf(stderr,
514 "Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported\n");
515 #else
516 #error Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported
517 #endif
518 return -1;
521 void kvm_create_irqchip(kvm_context_t kvm)
523 int r;
525 kvm->irqchip_in_kernel = 0;
526 #ifdef KVM_CAP_IRQCHIP
527 if (!kvm->no_irqchip_creation) {
528 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP);
529 if (r > 0) { /* kernel irqchip supported */
530 r = kvm_vm_ioctl(kvm_state, KVM_CREATE_IRQCHIP);
531 if (r >= 0) {
532 kvm->irqchip_inject_ioctl = KVM_IRQ_LINE;
533 #if defined(KVM_CAP_IRQ_INJECT_STATUS) && defined(KVM_IRQ_LINE_STATUS)
534 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION,
535 KVM_CAP_IRQ_INJECT_STATUS);
536 if (r > 0)
537 kvm->irqchip_inject_ioctl = KVM_IRQ_LINE_STATUS;
538 #endif
539 kvm->irqchip_in_kernel = 1;
540 } else
541 fprintf(stderr, "Create kernel PIC irqchip failed\n");
544 #endif
545 kvm_state->irqchip_in_kernel = kvm->irqchip_in_kernel;
548 int kvm_create(kvm_context_t kvm, unsigned long phys_mem_bytes, void **vm_mem)
550 int r;
552 r = kvm_create_vm(kvm);
553 if (r < 0)
554 return r;
555 r = kvm_arch_create(kvm, phys_mem_bytes, vm_mem);
556 if (r < 0)
557 return r;
558 init_slots();
559 r = kvm_create_default_phys_mem(kvm, phys_mem_bytes, vm_mem);
560 if (r < 0)
561 return r;
562 kvm_create_irqchip(kvm);
564 return 0;
568 int kvm_register_phys_mem(kvm_context_t kvm,
569 unsigned long phys_start, void *userspace_addr,
570 unsigned long len, int log)
573 struct kvm_userspace_memory_region memory = {
574 .memory_size = len,
575 .guest_phys_addr = phys_start,
576 .userspace_addr = (unsigned long) (uintptr_t) userspace_addr,
577 .flags = log ? KVM_MEM_LOG_DIRTY_PAGES : 0,
579 int r;
581 memory.slot = get_free_slot(kvm);
582 DPRINTF
583 ("memory: gpa: %llx, size: %llx, uaddr: %llx, slot: %x, flags: %lx\n",
584 memory.guest_phys_addr, memory.memory_size, memory.userspace_addr,
585 memory.slot, memory.flags);
586 r = kvm_vm_ioctl(kvm_state, KVM_SET_USER_MEMORY_REGION, &memory);
587 if (r < 0) {
588 fprintf(stderr, "create_userspace_phys_mem: %s\n", strerror(-r));
589 return -1;
591 register_slot(memory.slot, memory.guest_phys_addr, memory.memory_size,
592 memory.userspace_addr, memory.flags);
593 return 0;
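/*
 * Guest RAM is described to the kernel as numbered slots via
 * KVM_SET_USER_MEMORY_REGION; writing a slot with memory_size == 0 deletes
 * it (see kvm_destroy_phys_mem() below).  Minimal sketch of a caller,
 * assuming a 1 MB host buffer "host_ptr" that should appear at guest
 * physical address 1 MB (not a call taken from this file):
 *
 *     kvm_register_phys_mem(kvm_context, 0x100000, host_ptr, 0x100000, 0);
 */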
/* destroy/free a whole slot.
 * phys_start, len and slot are the params passed to kvm_create_phys_mem()
 */
600 void kvm_destroy_phys_mem(kvm_context_t kvm, unsigned long phys_start,
601 unsigned long len)
603 int slot;
604 int r;
605 struct kvm_userspace_memory_region memory = {
606 .memory_size = 0,
607 .guest_phys_addr = phys_start,
608 .userspace_addr = 0,
609 .flags = 0,
612 slot = get_slot(phys_start);
614 if ((slot >= KVM_MAX_NUM_MEM_REGIONS) || (slot == -1)) {
615 fprintf(stderr, "BUG: %s: invalid parameters (slot=%d)\n", __FUNCTION__,
616 slot);
617 return;
619 if (phys_start != slots[slot].phys_addr) {
620 fprintf(stderr,
621 "WARNING: %s: phys_start is 0x%lx expecting 0x%lx\n",
622 __FUNCTION__, phys_start, slots[slot].phys_addr);
623 phys_start = slots[slot].phys_addr;
626 memory.slot = slot;
627 DPRINTF("slot %d start %llx len %llx flags %x\n",
628 memory.slot, memory.guest_phys_addr, memory.memory_size,
629 memory.flags);
630 r = kvm_vm_ioctl(kvm_state, KVM_SET_USER_MEMORY_REGION, &memory);
631 if (r < 0) {
        fprintf(stderr, "destroy_userspace_phys_mem: %s\n", strerror(-r));
633 return;
636 free_slot(memory.slot);
639 void kvm_unregister_memory_area(kvm_context_t kvm, uint64_t phys_addr,
640 unsigned long size)
643 int slot = get_container_slot(phys_addr, size);
645 if (slot != -1) {
646 DPRINTF("Unregistering memory region %llx (%lx)\n", phys_addr, size);
647 kvm_destroy_phys_mem(kvm, phys_addr, size);
648 return;
652 static int kvm_get_map(kvm_context_t kvm, int ioctl_num, int slot, void *buf)
654 int r;
655 struct kvm_dirty_log log = {
656 .slot = slot,
659 log.dirty_bitmap = buf;
661 r = kvm_vm_ioctl(kvm_state, ioctl_num, &log);
662 if (r < 0)
663 return r;
664 return 0;
667 int kvm_get_dirty_pages(kvm_context_t kvm, unsigned long phys_addr, void *buf)
669 int slot;
671 slot = get_slot(phys_addr);
672 return kvm_get_map(kvm, KVM_GET_DIRTY_LOG, slot, buf);
675 int kvm_get_dirty_pages_range(kvm_context_t kvm, unsigned long phys_addr,
676 unsigned long len, void *opaque,
677 int (*cb)(unsigned long start,
678 unsigned long len, void *bitmap,
679 void *opaque))
681 int i;
682 int r;
683 unsigned long end_addr = phys_addr + len;
684 void *buf;
686 for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) {
687 if ((slots[i].len && (uint64_t) slots[i].phys_addr >= phys_addr)
688 && ((uint64_t) slots[i].phys_addr + slots[i].len <= end_addr)) {
689 buf = qemu_malloc(BITMAP_SIZE(slots[i].len));
690 r = kvm_get_map(kvm, KVM_GET_DIRTY_LOG, i, buf);
691 if (r) {
692 qemu_free(buf);
693 return r;
695 r = cb(slots[i].phys_addr, slots[i].len, buf, opaque);
696 qemu_free(buf);
697 if (r)
698 return r;
701 return 0;
704 #ifdef KVM_CAP_IRQCHIP
706 int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int *status)
708 struct kvm_irq_level event;
709 int r;
711 if (!kvm->irqchip_in_kernel)
712 return 0;
713 event.level = level;
714 event.irq = irq;
715 r = kvm_vm_ioctl(kvm_state, kvm->irqchip_inject_ioctl, &event);
716 if (r < 0)
717 perror("kvm_set_irq_level");
719 if (status) {
720 #ifdef KVM_CAP_IRQ_INJECT_STATUS
721 *status =
722 (kvm->irqchip_inject_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
723 #else
724 *status = 1;
725 #endif
728 return 1;
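/*
 * Return convention: kvm_set_irq_level() yields 0 when no in-kernel irqchip
 * is present and 1 otherwise; with KVM_CAP_IRQ_INJECT_STATUS the optional
 * *status out-parameter carries the kernel's report of whether the
 * interrupt was actually delivered.  Hedged usage sketch (not a caller
 * copied from this file):
 *
 *     int delivered;
 *     kvm_set_irq_level(kvm_context, 10, 1, &delivered);   // assert IRQ 10
 *     kvm_set_irq_level(kvm_context, 10, 0, NULL);         // de-assert it
 */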
731 int kvm_get_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
733 int r;
735 if (!kvm->irqchip_in_kernel)
736 return 0;
737 r = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip);
738 if (r < 0) {
        perror("kvm_get_irqchip");
741 return r;
744 int kvm_set_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
746 int r;
748 if (!kvm->irqchip_in_kernel)
749 return 0;
750 r = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip);
751 if (r < 0) {
        perror("kvm_set_irqchip");
754 return r;
757 #endif
759 static int handle_io(CPUState *env)
761 struct kvm_run *run = env->kvm_run;
762 uint16_t addr = run->io.port;
763 int i;
764 void *p = (void *) run + run->io.data_offset;
766 for (i = 0; i < run->io.count; ++i) {
767 switch (run->io.direction) {
768 case KVM_EXIT_IO_IN:
769 switch (run->io.size) {
770 case 1:
771 *(uint8_t *) p = cpu_inb(addr);
772 break;
773 case 2:
774 *(uint16_t *) p = cpu_inw(addr);
775 break;
776 case 4:
777 *(uint32_t *) p = cpu_inl(addr);
778 break;
779 default:
780 fprintf(stderr, "bad I/O size %d\n", run->io.size);
781 return -EMSGSIZE;
783 break;
784 case KVM_EXIT_IO_OUT:
785 switch (run->io.size) {
786 case 1:
787 cpu_outb(addr, *(uint8_t *) p);
788 break;
789 case 2:
790 cpu_outw(addr, *(uint16_t *) p);
791 break;
792 case 4:
793 cpu_outl(addr, *(uint32_t *) p);
794 break;
795 default:
796 fprintf(stderr, "bad I/O size %d\n", run->io.size);
797 return -EMSGSIZE;
799 break;
800 default:
801 fprintf(stderr, "bad I/O direction %d\n", run->io.direction);
802 return -EPROTO;
805 p += run->io.size;
808 return 0;
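/*
 * For KVM_EXIT_IO the kernel places the data for all io.count repetitions
 * inline in the shared kvm_run page starting at io.data_offset; the loop
 * above advances p by io.size bytes per iteration and forwards each access
 * to the QEMU port handlers (cpu_inb/cpu_outb and friends).
 */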
811 static int handle_debug(CPUState *env)
813 #ifdef KVM_CAP_SET_GUEST_DEBUG
814 struct kvm_run *run = env->kvm_run;
816 return kvm_debug(env, &run->debug.arch);
817 #else
818 return 0;
819 #endif
822 int kvm_get_regs(kvm_vcpu_context_t vcpu, struct kvm_regs *regs)
824 return ioctl(vcpu->fd, KVM_GET_REGS, regs);
827 int kvm_set_regs(kvm_vcpu_context_t vcpu, struct kvm_regs *regs)
829 return ioctl(vcpu->fd, KVM_SET_REGS, regs);
832 int kvm_get_fpu(kvm_vcpu_context_t vcpu, struct kvm_fpu *fpu)
834 return ioctl(vcpu->fd, KVM_GET_FPU, fpu);
837 int kvm_set_fpu(kvm_vcpu_context_t vcpu, struct kvm_fpu *fpu)
839 return ioctl(vcpu->fd, KVM_SET_FPU, fpu);
842 int kvm_get_sregs(kvm_vcpu_context_t vcpu, struct kvm_sregs *sregs)
844 return ioctl(vcpu->fd, KVM_GET_SREGS, sregs);
847 int kvm_set_sregs(kvm_vcpu_context_t vcpu, struct kvm_sregs *sregs)
849 return ioctl(vcpu->fd, KVM_SET_SREGS, sregs);
852 #ifdef KVM_CAP_MP_STATE
853 int kvm_get_mpstate(kvm_vcpu_context_t vcpu, struct kvm_mp_state *mp_state)
855 int r;
857 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE);
858 if (r > 0)
859 return ioctl(vcpu->fd, KVM_GET_MP_STATE, mp_state);
860 return -ENOSYS;
863 int kvm_set_mpstate(kvm_vcpu_context_t vcpu, struct kvm_mp_state *mp_state)
865 int r;
867 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE);
868 if (r > 0)
869 return ioctl(vcpu->fd, KVM_SET_MP_STATE, mp_state);
870 return -ENOSYS;
872 #endif
874 static int handle_mmio(CPUState *env)
876 unsigned long addr = env->kvm_run->mmio.phys_addr;
877 struct kvm_run *kvm_run = env->kvm_run;
878 void *data = kvm_run->mmio.data;
880 /* hack: Red Hat 7.1 generates these weird accesses. */
881 if ((addr > 0xa0000 - 4 && addr <= 0xa0000) && kvm_run->mmio.len == 3)
882 return 0;
884 cpu_physical_memory_rw(addr, data, kvm_run->mmio.len, kvm_run->mmio.is_write);
885 return 0;
888 int handle_io_window(kvm_context_t kvm)
890 return 1;
893 int handle_shutdown(kvm_context_t kvm, CPUState *env)
895 /* stop the current vcpu from going back to guest mode */
896 env->stopped = 1;
898 qemu_system_reset_request();
899 return 1;
902 static inline void push_nmi(kvm_context_t kvm)
904 #ifdef KVM_CAP_USER_NMI
905 kvm_arch_push_nmi(kvm->opaque);
906 #endif /* KVM_CAP_USER_NMI */
909 void post_kvm_run(kvm_context_t kvm, CPUState *env)
911 pthread_mutex_lock(&qemu_mutex);
912 kvm_arch_post_kvm_run(kvm->opaque, env);
915 int pre_kvm_run(kvm_context_t kvm, CPUState *env)
917 kvm_arch_pre_kvm_run(kvm->opaque, env);
919 pthread_mutex_unlock(&qemu_mutex);
920 return 0;
923 int kvm_get_interrupt_flag(CPUState *env)
925 return env->kvm_run->if_flag;
928 int kvm_is_ready_for_interrupt_injection(CPUState *env)
930 return env->kvm_run->ready_for_interrupt_injection;
933 int kvm_run(kvm_vcpu_context_t vcpu, void *env)
935 int r;
936 int fd = vcpu->fd;
937 CPUState *_env = env;
938 kvm_context_t kvm = &_env->kvm_state->kvm_context;
939 struct kvm_run *run = _env->kvm_run;
941 again:
942 push_nmi(kvm);
943 #if !defined(__s390__)
944 if (!kvm->irqchip_in_kernel)
945 run->request_interrupt_window = kvm_arch_try_push_interrupts(env);
946 #endif
948 if (_env->kvm_cpu_state.regs_modified) {
949 kvm_arch_put_registers(_env);
950 _env->kvm_cpu_state.regs_modified = 0;
953 r = pre_kvm_run(kvm, env);
954 if (r)
955 return r;
956 r = ioctl(fd, KVM_RUN, 0);
958 if (r == -1 && errno != EINTR && errno != EAGAIN) {
959 r = -errno;
960 post_kvm_run(kvm, env);
961 fprintf(stderr, "kvm_run: %s\n", strerror(-r));
962 return r;
965 post_kvm_run(kvm, env);
967 #if defined(KVM_CAP_COALESCED_MMIO)
968 if (kvm_state->coalesced_mmio) {
969 struct kvm_coalesced_mmio_ring *ring =
970 (void *) run + kvm_state->coalesced_mmio * PAGE_SIZE;
971 while (ring->first != ring->last) {
972 cpu_physical_memory_rw(ring->coalesced_mmio[ring->first].phys_addr,
973 &ring->coalesced_mmio[ring->first].data[0],
974 ring->coalesced_mmio[ring->first].len, 1);
975 smp_wmb();
976 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
979 #endif
981 #if !defined(__s390__)
982 if (r == -1) {
983 r = handle_io_window(kvm);
984 goto more;
986 #endif
987 if (1) {
988 switch (run->exit_reason) {
989 case KVM_EXIT_UNKNOWN:
990 r = handle_unhandled(run->hw.hardware_exit_reason);
991 break;
992 case KVM_EXIT_FAIL_ENTRY:
993 r = handle_unhandled(run->fail_entry.hardware_entry_failure_reason);
994 break;
995 case KVM_EXIT_EXCEPTION:
996 fprintf(stderr, "exception %d (%x)\n", run->ex.exception,
997 run->ex.error_code);
998 kvm_show_regs(vcpu);
999 kvm_show_code(vcpu);
1000 abort();
1001 break;
1002 case KVM_EXIT_IO:
1003 r = handle_io(env);
1004 break;
1005 case KVM_EXIT_DEBUG:
1006 r = handle_debug(env);
1007 break;
1008 case KVM_EXIT_MMIO:
1009 r = handle_mmio(env);
1010 break;
1011 case KVM_EXIT_HLT:
1012 r = kvm_arch_halt(vcpu);
1013 break;
1014 case KVM_EXIT_IRQ_WINDOW_OPEN:
1015 break;
1016 case KVM_EXIT_SHUTDOWN:
1017 r = handle_shutdown(kvm, env);
1018 break;
1019 #if defined(__s390__)
1020 case KVM_EXIT_S390_SIEIC:
1021 r = kvm_s390_handle_intercept(kvm, vcpu, run);
1022 break;
1023 case KVM_EXIT_S390_RESET:
1024 r = kvm_s390_handle_reset(kvm, vcpu, run);
1025 break;
1026 #endif
1027 case KVM_EXIT_INTERNAL_ERROR:
1028 fprintf(stderr, "KVM internal error. Suberror: %d\n",
1029 run->internal.suberror);
1030 kvm_show_regs(vcpu);
1031 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION)
1032 fprintf(stderr, "emulation failure, check dmesg for details\n");
1033 abort();
1034 break;
1035 default:
1036 if (kvm_arch_run(env)) {
1037 fprintf(stderr, "unhandled vm exit: 0x%x\n", run->exit_reason);
1038 kvm_show_regs(vcpu);
1039 abort();
1041 break;
1044 more:
1045 if (!r)
1046 goto again;
1047 return r;
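/*
 * Structure of kvm_run(): pending interrupts are pushed before entering the
 * guest, qemu_mutex is dropped across the KVM_RUN ioctl
 * (pre_kvm_run/post_kvm_run), coalesced MMIO is drained after every exit,
 * and a zero return from an exit handler jumps back to the "again" label so
 * the vcpu re-enters the guest immediately.
 */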
1050 int kvm_inject_irq(kvm_vcpu_context_t vcpu, unsigned irq)
1052 struct kvm_interrupt intr;
1054 intr.irq = irq;
1055 return ioctl(vcpu->fd, KVM_INTERRUPT, &intr);
1058 #ifdef KVM_CAP_SET_GUEST_DEBUG
1059 int kvm_set_guest_debug(kvm_vcpu_context_t vcpu, struct kvm_guest_debug *dbg)
1061 return ioctl(vcpu->fd, KVM_SET_GUEST_DEBUG, dbg);
1063 #endif
1065 int kvm_set_signal_mask(kvm_vcpu_context_t vcpu, const sigset_t *sigset)
1067 struct kvm_signal_mask *sigmask;
1068 int r;
1070 if (!sigset) {
1071 r = ioctl(vcpu->fd, KVM_SET_SIGNAL_MASK, NULL);
1072 if (r == -1)
1073 r = -errno;
1074 return r;
1076 sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));
1078 sigmask->len = 8;
1079 memcpy(sigmask->sigset, sigset, sizeof(*sigset));
1080 r = ioctl(vcpu->fd, KVM_SET_SIGNAL_MASK, sigmask);
1081 if (r == -1)
1082 r = -errno;
1083 free(sigmask);
1084 return r;
1087 int kvm_pit_in_kernel(kvm_context_t kvm)
1089 return kvm->pit_in_kernel;
1092 int kvm_inject_nmi(kvm_vcpu_context_t vcpu)
1094 #ifdef KVM_CAP_USER_NMI
1095 return ioctl(vcpu->fd, KVM_NMI);
1096 #else
1097 return -ENOSYS;
1098 #endif
1101 int kvm_init_coalesced_mmio(kvm_context_t kvm)
1103 int r = 0;
1104 kvm_state->coalesced_mmio = 0;
1105 #ifdef KVM_CAP_COALESCED_MMIO
1106 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
1107 if (r > 0) {
1108 kvm_state->coalesced_mmio = r;
1109 return 0;
1111 #endif
1112 return r;
1115 #ifdef KVM_CAP_DEVICE_ASSIGNMENT
1116 int kvm_assign_pci_device(kvm_context_t kvm,
1117 struct kvm_assigned_pci_dev *assigned_dev)
1119 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_PCI_DEVICE, assigned_dev);
1122 static int kvm_old_assign_irq(kvm_context_t kvm,
1123 struct kvm_assigned_irq *assigned_irq)
1125 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_IRQ, assigned_irq);
1128 #ifdef KVM_CAP_ASSIGN_DEV_IRQ
1129 int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
1131 int ret;
1133 ret = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_ASSIGN_DEV_IRQ);
1134 if (ret > 0) {
1135 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_DEV_IRQ, assigned_irq);
1138 return kvm_old_assign_irq(kvm, assigned_irq);
1141 int kvm_deassign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
1143 return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_DEV_IRQ, assigned_irq);
1145 #else
1146 int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
1148 return kvm_old_assign_irq(kvm, assigned_irq);
1150 #endif
1151 #endif
1153 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT
1154 int kvm_deassign_pci_device(kvm_context_t kvm,
1155 struct kvm_assigned_pci_dev *assigned_dev)
1157 return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_PCI_DEVICE, assigned_dev);
1159 #endif
1161 int kvm_destroy_memory_region_works(kvm_context_t kvm)
1163 int ret = 0;
1165 #ifdef KVM_CAP_DESTROY_MEMORY_REGION_WORKS
1166 ret =
1167 kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION,
1168 KVM_CAP_DESTROY_MEMORY_REGION_WORKS);
1169 if (ret <= 0)
1170 ret = 0;
1171 #endif
1172 return ret;
1175 int kvm_reinject_control(kvm_context_t kvm, int pit_reinject)
1177 #ifdef KVM_CAP_REINJECT_CONTROL
1178 int r;
1179 struct kvm_reinject_control control;
1181 control.pit_reinject = pit_reinject;
1183 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_REINJECT_CONTROL);
1184 if (r > 0) {
1185 return kvm_vm_ioctl(kvm_state, KVM_REINJECT_CONTROL, &control);
1187 #endif
1188 return -ENOSYS;
1191 int kvm_has_gsi_routing(kvm_context_t kvm)
1193 int r = 0;
1195 #ifdef KVM_CAP_IRQ_ROUTING
1196 r = kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
1197 #endif
1198 return r;
1201 int kvm_get_gsi_count(kvm_context_t kvm)
1203 #ifdef KVM_CAP_IRQ_ROUTING
1204 return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
1205 #else
1206 return -EINVAL;
1207 #endif
1210 int kvm_clear_gsi_routes(kvm_context_t kvm)
1212 #ifdef KVM_CAP_IRQ_ROUTING
1213 kvm->irq_routes->nr = 0;
1214 return 0;
1215 #else
1216 return -EINVAL;
1217 #endif
1220 int kvm_add_routing_entry(kvm_context_t kvm,
1221 struct kvm_irq_routing_entry *entry)
1223 #ifdef KVM_CAP_IRQ_ROUTING
1224 struct kvm_irq_routing *z;
1225 struct kvm_irq_routing_entry *new;
1226 int n, size;
1228 if (kvm->irq_routes->nr == kvm->nr_allocated_irq_routes) {
1229 n = kvm->nr_allocated_irq_routes * 2;
1230 if (n < 64)
1231 n = 64;
1232 size = sizeof(struct kvm_irq_routing);
1233 size += n * sizeof(*new);
1234 z = realloc(kvm->irq_routes, size);
1235 if (!z)
1236 return -ENOMEM;
1237 kvm->nr_allocated_irq_routes = n;
1238 kvm->irq_routes = z;
1240 n = kvm->irq_routes->nr++;
1241 new = &kvm->irq_routes->entries[n];
1242 memset(new, 0, sizeof(*new));
1243 new->gsi = entry->gsi;
1244 new->type = entry->type;
1245 new->flags = entry->flags;
1246 new->u = entry->u;
1248 set_gsi(kvm, entry->gsi);
1250 return 0;
1251 #else
1252 return -ENOSYS;
1253 #endif
1256 int kvm_add_irq_route(kvm_context_t kvm, int gsi, int irqchip, int pin)
1258 #ifdef KVM_CAP_IRQ_ROUTING
1259 struct kvm_irq_routing_entry e;
1261 e.gsi = gsi;
1262 e.type = KVM_IRQ_ROUTING_IRQCHIP;
1263 e.flags = 0;
1264 e.u.irqchip.irqchip = irqchip;
1265 e.u.irqchip.pin = pin;
1266 return kvm_add_routing_entry(kvm, &e);
1267 #else
1268 return -ENOSYS;
1269 #endif
1272 int kvm_del_routing_entry(kvm_context_t kvm,
1273 struct kvm_irq_routing_entry *entry)
1275 #ifdef KVM_CAP_IRQ_ROUTING
1276 struct kvm_irq_routing_entry *e, *p;
1277 int i, gsi, found = 0;
1279 gsi = entry->gsi;
1281 for (i = 0; i < kvm->irq_routes->nr; ++i) {
1282 e = &kvm->irq_routes->entries[i];
1283 if (e->type == entry->type && e->gsi == gsi) {
1284 switch (e->type) {
1285 case KVM_IRQ_ROUTING_IRQCHIP:{
1286 if (e->u.irqchip.irqchip ==
1287 entry->u.irqchip.irqchip
1288 && e->u.irqchip.pin == entry->u.irqchip.pin) {
1289 p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
1290 *e = *p;
1291 found = 1;
1293 break;
1295 case KVM_IRQ_ROUTING_MSI:{
1296 if (e->u.msi.address_lo ==
1297 entry->u.msi.address_lo
1298 && e->u.msi.address_hi ==
1299 entry->u.msi.address_hi
1300 && e->u.msi.data == entry->u.msi.data) {
1301 p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
1302 *e = *p;
1303 found = 1;
1305 break;
1307 default:
1308 break;
1310 if (found) {
1311 /* If there are no other users of this GSI
1312 * mark it available in the bitmap */
1313 for (i = 0; i < kvm->irq_routes->nr; i++) {
1314 e = &kvm->irq_routes->entries[i];
1315 if (e->gsi == gsi)
1316 break;
1318 if (i == kvm->irq_routes->nr)
1319 clear_gsi(kvm, gsi);
1321 return 0;
1325 return -ESRCH;
1326 #else
1327 return -ENOSYS;
1328 #endif
1331 int kvm_update_routing_entry(kvm_context_t kvm,
1332 struct kvm_irq_routing_entry *entry,
1333 struct kvm_irq_routing_entry *newentry)
1335 #ifdef KVM_CAP_IRQ_ROUTING
1336 struct kvm_irq_routing_entry *e;
1337 int i;
1339 if (entry->gsi != newentry->gsi || entry->type != newentry->type) {
1340 return -EINVAL;
1343 for (i = 0; i < kvm->irq_routes->nr; ++i) {
1344 e = &kvm->irq_routes->entries[i];
1345 if (e->type != entry->type || e->gsi != entry->gsi) {
1346 continue;
1348 switch (e->type) {
1349 case KVM_IRQ_ROUTING_IRQCHIP:
1350 if (e->u.irqchip.irqchip == entry->u.irqchip.irqchip &&
1351 e->u.irqchip.pin == entry->u.irqchip.pin) {
1352 memcpy(&e->u.irqchip, &newentry->u.irqchip,
1353 sizeof e->u.irqchip);
1354 return 0;
1356 break;
1357 case KVM_IRQ_ROUTING_MSI:
1358 if (e->u.msi.address_lo == entry->u.msi.address_lo &&
1359 e->u.msi.address_hi == entry->u.msi.address_hi &&
1360 e->u.msi.data == entry->u.msi.data) {
1361 memcpy(&e->u.msi, &newentry->u.msi, sizeof e->u.msi);
1362 return 0;
1364 break;
1365 default:
1366 break;
1369 return -ESRCH;
1370 #else
1371 return -ENOSYS;
1372 #endif
1375 int kvm_del_irq_route(kvm_context_t kvm, int gsi, int irqchip, int pin)
1377 #ifdef KVM_CAP_IRQ_ROUTING
1378 struct kvm_irq_routing_entry e;
1380 e.gsi = gsi;
1381 e.type = KVM_IRQ_ROUTING_IRQCHIP;
1382 e.flags = 0;
1383 e.u.irqchip.irqchip = irqchip;
1384 e.u.irqchip.pin = pin;
1385 return kvm_del_routing_entry(kvm, &e);
1386 #else
1387 return -ENOSYS;
1388 #endif
1391 int kvm_commit_irq_routes(kvm_context_t kvm)
1393 #ifdef KVM_CAP_IRQ_ROUTING
1394 kvm->irq_routes->flags = 0;
1395 return kvm_vm_ioctl(kvm_state, KVM_SET_GSI_ROUTING, kvm->irq_routes);
1396 #else
1397 return -ENOSYS;
1398 #endif
1401 int kvm_get_irq_route_gsi(kvm_context_t kvm)
1403 int i, bit;
1404 uint32_t *buf = kvm->used_gsi_bitmap;
1406 /* Return the lowest unused GSI in the bitmap */
1407 for (i = 0; i < kvm->max_gsi / 32; i++) {
1408 bit = ffs(~buf[i]);
1409 if (!bit)
1410 continue;
1412 return bit - 1 + i * 32;
1415 return -ENOSPC;
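/*
 * Illustrative MSI routing setup using the helpers above (a sketch, not a
 * call sequence copied from this file; addr_lo/addr_hi/data stand for the
 * values programmed into the device's MSI capability):
 *
 *     struct kvm_irq_routing_entry e = { 0 };
 *     e.gsi = kvm_get_irq_route_gsi(kvm_context);   // may return -ENOSPC
 *     e.type = KVM_IRQ_ROUTING_MSI;
 *     e.u.msi.address_lo = addr_lo;
 *     e.u.msi.address_hi = addr_hi;
 *     e.u.msi.data = data;
 *     kvm_add_routing_entry(kvm_context, &e);
 *     kvm_commit_irq_routes(kvm_context);
 */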
1418 #ifdef KVM_CAP_DEVICE_MSIX
1419 int kvm_assign_set_msix_nr(kvm_context_t kvm,
1420 struct kvm_assigned_msix_nr *msix_nr)
1422 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_NR, msix_nr);
1425 int kvm_assign_set_msix_entry(kvm_context_t kvm,
1426 struct kvm_assigned_msix_entry *entry)
1428 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_ENTRY, entry);
1430 #endif
1432 #if defined(KVM_CAP_IRQFD) && defined(CONFIG_EVENTFD)
1434 #include <sys/eventfd.h>
1436 static int _kvm_irqfd(kvm_context_t kvm, int fd, int gsi, int flags)
1438 struct kvm_irqfd data = {
1439 .fd = fd,
1440 .gsi = gsi,
1441 .flags = flags,
1444 return kvm_vm_ioctl(kvm_state, KVM_IRQFD, &data);
1447 int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
1449 int r;
1450 int fd;
1452 if (!kvm_check_extension(kvm_state, KVM_CAP_IRQFD))
1453 return -ENOENT;
1455 fd = eventfd(0, 0);
1456 if (fd < 0)
1457 return -errno;
1459 r = _kvm_irqfd(kvm, fd, gsi, 0);
1460 if (r < 0) {
1461 close(fd);
1462 return -errno;
1465 return fd;
1468 #else /* KVM_CAP_IRQFD */
1470 int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
1472 return -ENOSYS;
1475 #endif /* KVM_CAP_IRQFD */
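/*
 * kvm_irqfd() (when KVM_CAP_IRQFD and eventfd are available) returns an
 * eventfd descriptor bound to the given GSI.  Hedged usage sketch (not a
 * caller taken from this file):
 *
 *     int fd = kvm_irqfd(kvm_context, gsi, 0);
 *     if (fd >= 0) {
 *         uint64_t one = 1;
 *         write(fd, &one, sizeof(one));   // kernel injects "gsi"
 *     }
 *
 * so another thread or a vhost backend can raise the interrupt without
 * bouncing through userspace.
 */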
1476 static inline unsigned long kvm_get_thread_id(void)
1478 return syscall(SYS_gettid);
1481 static void qemu_cond_wait(pthread_cond_t *cond)
1483 CPUState *env = cpu_single_env;
1485 pthread_cond_wait(cond, &qemu_mutex);
1486 cpu_single_env = env;
1489 static void sig_ipi_handler(int n)
1493 static void hardware_memory_error(void)
1495 fprintf(stderr, "Hardware memory error!\n");
1496 exit(1);
1499 static void sigbus_reraise(void)
1501 sigset_t set;
1502 struct sigaction action;
1504 memset(&action, 0, sizeof(action));
1505 action.sa_handler = SIG_DFL;
1506 if (!sigaction(SIGBUS, &action, NULL)) {
1507 raise(SIGBUS);
1508 sigemptyset(&set);
1509 sigaddset(&set, SIGBUS);
1510 sigprocmask(SIG_UNBLOCK, &set, NULL);
    perror("Failed to re-raise SIGBUS!");
1513 abort();
1516 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
1517 void *ctx)
1519 #if defined(KVM_CAP_MCE) && defined(TARGET_I386)
1520 if (first_cpu->mcg_cap && siginfo->ssi_addr
1521 && siginfo->ssi_code == BUS_MCEERR_AO) {
1522 uint64_t status;
1523 unsigned long paddr;
1524 CPUState *cenv;
1526 /* Hope we are lucky for AO MCE */
1527 if (do_qemu_ram_addr_from_host((void *)(intptr_t)siginfo->ssi_addr,
1528 &paddr)) {
1529 fprintf(stderr, "Hardware memory error for memory used by "
1530 "QEMU itself instead of guest system!: %llx\n",
1531 (unsigned long long)siginfo->ssi_addr);
1532 return;
1534 status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1535 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1536 | 0xc0;
1537 kvm_inject_x86_mce(first_cpu, 9, status,
1538 MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
1539 (MCM_ADDR_PHYS << 6) | 0xc, 1);
1540 for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
1541 kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
1542 MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
1543 } else
1544 #endif
1546 if (siginfo->ssi_code == BUS_MCEERR_AO)
1547 return;
1548 else if (siginfo->ssi_code == BUS_MCEERR_AR)
1549 hardware_memory_error();
1550 else
1551 sigbus_reraise();
1555 static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
1557 struct qemu_work_item wi;
1559 if (env == current_env) {
1560 func(data);
1561 return;
1564 wi.func = func;
1565 wi.data = data;
1566 if (!env->kvm_cpu_state.queued_work_first)
1567 env->kvm_cpu_state.queued_work_first = &wi;
1568 else
1569 env->kvm_cpu_state.queued_work_last->next = &wi;
1570 env->kvm_cpu_state.queued_work_last = &wi;
1571 wi.next = NULL;
1572 wi.done = false;
1574 pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
1575 while (!wi.done)
1576 qemu_cond_wait(&qemu_work_cond);
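/*
 * on_vcpu() executes "func" in the context of the vcpu thread that owns
 * "env": the work item is queued on the CPUState, the target thread is
 * kicked with SIG_IPI, and the caller sleeps on qemu_work_cond until
 * flush_queued_work() (further below) marks the item done.  Calls issued
 * from the owning thread itself run synchronously.
 */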
1579 void kvm_arch_get_registers(CPUState *env)
1581 kvm_arch_save_regs(env);
1584 static void do_kvm_cpu_synchronize_state(void *_env)
1586 CPUState *env = _env;
1587 if (!env->kvm_cpu_state.regs_modified) {
1588 kvm_arch_get_registers(env);
1589 env->kvm_cpu_state.regs_modified = 1;
1593 void kvm_cpu_synchronize_state(CPUState *env)
1595 if (!env->kvm_cpu_state.regs_modified)
1596 on_vcpu(env, do_kvm_cpu_synchronize_state, env);
1599 static void inject_interrupt(void *data)
1601 cpu_interrupt(current_env, (long) data);
1604 void kvm_inject_interrupt(CPUState *env, int mask)
1606 on_vcpu(env, inject_interrupt, (void *) (long) mask);
1609 void kvm_update_interrupt_request(CPUState *env)
1611 int signal = 0;
1613 if (env) {
1614 if (!current_env || !current_env->created)
1615 signal = 1;
        /*
         * Testing for created here is really redundant
         */
1619 if (current_env && current_env->created &&
1620 env != current_env && !env->kvm_cpu_state.signalled)
1621 signal = 1;
1623 if (signal) {
1624 env->kvm_cpu_state.signalled = 1;
1625 if (env->kvm_cpu_state.thread)
1626 pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
1631 static void kvm_do_load_registers(void *_env)
1633 CPUState *env = _env;
1635 kvm_arch_load_regs(env);
1638 void kvm_load_registers(CPUState *env)
1640 if (kvm_enabled() && qemu_system_ready)
1641 on_vcpu(env, kvm_do_load_registers, env);
1644 static void kvm_do_save_registers(void *_env)
1646 CPUState *env = _env;
1648 kvm_arch_save_regs(env);
1651 void kvm_save_registers(CPUState *env)
1653 if (kvm_enabled())
1654 on_vcpu(env, kvm_do_save_registers, env);
1657 static void kvm_do_load_mpstate(void *_env)
1659 CPUState *env = _env;
1661 kvm_arch_load_mpstate(env);
1664 void kvm_load_mpstate(CPUState *env)
1666 if (kvm_enabled() && qemu_system_ready)
1667 on_vcpu(env, kvm_do_load_mpstate, env);
1670 static void kvm_do_save_mpstate(void *_env)
1672 CPUState *env = _env;
1674 kvm_arch_save_mpstate(env);
1675 #ifdef KVM_CAP_MP_STATE
1676 if (kvm_irqchip_in_kernel())
1677 env->halted = (env->mp_state == KVM_MP_STATE_HALTED);
1678 #endif
1681 void kvm_save_mpstate(CPUState *env)
1683 if (kvm_enabled())
1684 on_vcpu(env, kvm_do_save_mpstate, env);
1687 int kvm_cpu_exec(CPUState *env)
1689 int r;
1691 r = kvm_run(env->kvm_cpu_state.vcpu_ctx, env);
1692 if (r < 0) {
1693 printf("kvm_run returned %d\n", r);
1694 vm_stop(0);
1697 return 0;
1700 static int is_cpu_stopped(CPUState *env)
1702 return !vm_running || env->stopped;
1705 static void flush_queued_work(CPUState *env)
1707 struct qemu_work_item *wi;
1709 if (!env->kvm_cpu_state.queued_work_first)
1710 return;
1712 while ((wi = env->kvm_cpu_state.queued_work_first)) {
1713 env->kvm_cpu_state.queued_work_first = wi->next;
1714 wi->func(wi->data);
1715 wi->done = true;
1717 env->kvm_cpu_state.queued_work_last = NULL;
1718 pthread_cond_broadcast(&qemu_work_cond);
1721 static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
1723 #if defined(KVM_CAP_MCE) && defined(TARGET_I386)
1724 struct kvm_x86_mce mce = {
1725 .bank = 9,
1727 unsigned long paddr;
1728 int r;
1730 if (env->mcg_cap && siginfo->si_addr
1731 && (siginfo->si_code == BUS_MCEERR_AR
1732 || siginfo->si_code == BUS_MCEERR_AO)) {
1733 if (siginfo->si_code == BUS_MCEERR_AR) {
1734 /* Fake an Intel architectural Data Load SRAR UCR */
1735 mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1736 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1737 | MCI_STATUS_AR | 0x134;
1738 mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
1739 mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
1740 } else {
1741 /* Fake an Intel architectural Memory scrubbing UCR */
1742 mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1743 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1744 | 0xc0;
1745 mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
1746 mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
1748 if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) {
1749 fprintf(stderr, "Hardware memory error for memory used by "
                    "QEMU itself instead of guest system!\n");
1751 /* Hope we are lucky for AO MCE */
1752 if (siginfo->si_code == BUS_MCEERR_AO)
1753 return;
1754 else
1755 hardware_memory_error();
1757 mce.addr = paddr;
1758 r = kvm_set_mce(env->kvm_cpu_state.vcpu_ctx, &mce);
1759 if (r < 0) {
1760 fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
1761 abort();
1763 } else
1764 #endif
1766 if (siginfo->si_code == BUS_MCEERR_AO)
1767 return;
1768 else if (siginfo->si_code == BUS_MCEERR_AR)
1769 hardware_memory_error();
1770 else
1771 sigbus_reraise();
1775 static void kvm_main_loop_wait(CPUState *env, int timeout)
1777 struct timespec ts;
1778 int r, e;
1779 siginfo_t siginfo;
1780 sigset_t waitset;
1781 sigset_t chkset;
1783 ts.tv_sec = timeout / 1000;
1784 ts.tv_nsec = (timeout % 1000) * 1000000;
1785 sigemptyset(&waitset);
1786 sigaddset(&waitset, SIG_IPI);
1787 sigaddset(&waitset, SIGBUS);
1789 do {
1790 pthread_mutex_unlock(&qemu_mutex);
1792 r = sigtimedwait(&waitset, &siginfo, &ts);
1793 e = errno;
1795 pthread_mutex_lock(&qemu_mutex);
1797 if (r == -1 && !(e == EAGAIN || e == EINTR)) {
1798 printf("sigtimedwait: %s\n", strerror(e));
1799 exit(1);
1802 switch (r) {
1803 case SIGBUS:
1804 kvm_on_sigbus(env, &siginfo);
1805 break;
1806 default:
1807 break;
1810 r = sigpending(&chkset);
1811 if (r == -1) {
1812 printf("sigpending: %s\n", strerror(e));
1813 exit(1);
1815 } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
1817 cpu_single_env = env;
1818 flush_queued_work(env);
1820 if (env->stop) {
1821 env->stop = 0;
1822 env->stopped = 1;
1823 pthread_cond_signal(&qemu_pause_cond);
1826 env->kvm_cpu_state.signalled = 0;
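/*
 * Signal model for vcpu threads: ap_main_loop() blocks all signals, while
 * setup_kernel_sigmask() unblocks SIG_IPI and SIGBUS only for the time
 * spent inside KVM_RUN.  A pthread_kill(..., SIG_IPI) therefore either
 * interrupts the guest in the kernel or is picked up here by
 * sigtimedwait().
 */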
1829 static int all_threads_paused(void)
1831 CPUState *penv = first_cpu;
1833 while (penv) {
1834 if (penv->stop)
1835 return 0;
1836 penv = (CPUState *) penv->next_cpu;
1839 return 1;
1842 static void pause_all_threads(void)
1844 CPUState *penv = first_cpu;
1846 while (penv) {
1847 if (penv != cpu_single_env) {
1848 penv->stop = 1;
1849 pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI);
1850 } else {
1851 penv->stop = 0;
1852 penv->stopped = 1;
1853 cpu_exit(penv);
1855 penv = (CPUState *) penv->next_cpu;
1858 while (!all_threads_paused())
1859 qemu_cond_wait(&qemu_pause_cond);
1862 static void resume_all_threads(void)
1864 CPUState *penv = first_cpu;
1866 assert(!cpu_single_env);
1868 while (penv) {
1869 penv->stop = 0;
1870 penv->stopped = 0;
1871 pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI);
1872 penv = (CPUState *) penv->next_cpu;
1876 static void kvm_vm_state_change_handler(void *context, int running, int reason)
1878 if (running)
1879 resume_all_threads();
1880 else
1881 pause_all_threads();
1884 static void setup_kernel_sigmask(CPUState *env)
1886 sigset_t set;
1888 sigemptyset(&set);
1889 sigaddset(&set, SIGUSR2);
1890 sigaddset(&set, SIGIO);
1891 sigaddset(&set, SIGALRM);
1892 sigprocmask(SIG_BLOCK, &set, NULL);
1894 sigprocmask(SIG_BLOCK, NULL, &set);
1895 sigdelset(&set, SIG_IPI);
1896 sigdelset(&set, SIGBUS);
1898 kvm_set_signal_mask(env->kvm_cpu_state.vcpu_ctx, &set);
1901 static void qemu_kvm_system_reset(void)
1903 CPUState *penv = first_cpu;
1905 pause_all_threads();
1907 qemu_system_reset();
1909 while (penv) {
1910 kvm_arch_cpu_reset(penv);
1911 penv = (CPUState *) penv->next_cpu;
1914 resume_all_threads();
1917 static void process_irqchip_events(CPUState *env)
1919 kvm_arch_process_irqchip_events(env);
1920 if (kvm_arch_has_work(env))
1921 env->halted = 0;
1924 static int kvm_main_loop_cpu(CPUState *env)
1926 while (1) {
1927 int run_cpu = !is_cpu_stopped(env);
1928 if (run_cpu && !kvm_irqchip_in_kernel()) {
1929 process_irqchip_events(env);
1930 run_cpu = !env->halted;
1932 if (run_cpu) {
1933 kvm_main_loop_wait(env, 0);
1934 kvm_cpu_exec(env);
1935 } else {
1936 kvm_main_loop_wait(env, 1000);
1939 pthread_mutex_unlock(&qemu_mutex);
1940 return 0;
1943 static void *ap_main_loop(void *_env)
1945 CPUState *env = _env;
1946 sigset_t signals;
1947 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1948 struct ioperm_data *data = NULL;
1949 #endif
1951 current_env = env;
1952 env->thread_id = kvm_get_thread_id();
1953 sigfillset(&signals);
1954 sigprocmask(SIG_BLOCK, &signals, NULL);
1955 env->kvm_cpu_state.vcpu_ctx = kvm_create_vcpu(env, env->cpu_index);
1957 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1958 /* do ioperm for io ports of assigned devices */
1959 QLIST_FOREACH(data, &ioperm_head, entries)
1960 on_vcpu(env, kvm_arch_do_ioperm, data);
1961 #endif
1963 setup_kernel_sigmask(env);
1965 pthread_mutex_lock(&qemu_mutex);
1966 cpu_single_env = env;
1968 kvm_arch_init_vcpu(env);
1969 #ifdef TARGET_I386
1970 kvm_tpr_vcpu_start(env);
1971 #endif
1973 kvm_arch_load_regs(env);
1975 /* signal VCPU creation */
1976 current_env->created = 1;
1977 pthread_cond_signal(&qemu_vcpu_cond);
1979 /* and wait for machine initialization */
1980 while (!qemu_system_ready)
1981 qemu_cond_wait(&qemu_system_cond);
1983 /* re-initialize cpu_single_env after re-acquiring qemu_mutex */
1984 cpu_single_env = env;
1986 kvm_main_loop_cpu(env);
1987 return NULL;
1990 void kvm_init_vcpu(CPUState *env)
1992 pthread_create(&env->kvm_cpu_state.thread, NULL, ap_main_loop, env);
1994 while (env->created == 0)
1995 qemu_cond_wait(&qemu_vcpu_cond);
1998 int kvm_vcpu_inited(CPUState *env)
2000 return env->created;
2003 #ifdef TARGET_I386
2004 void kvm_hpet_disable_kpit(void)
2006 struct kvm_pit_state2 ps2;
2008 kvm_get_pit2(kvm_context, &ps2);
2009 ps2.flags |= KVM_PIT_FLAGS_HPET_LEGACY;
2010 kvm_set_pit2(kvm_context, &ps2);
2013 void kvm_hpet_enable_kpit(void)
2015 struct kvm_pit_state2 ps2;
2017 kvm_get_pit2(kvm_context, &ps2);
2018 ps2.flags &= ~KVM_PIT_FLAGS_HPET_LEGACY;
2019 kvm_set_pit2(kvm_context, &ps2);
2021 #endif
2023 int kvm_init_ap(void)
2025 struct sigaction action;
2027 #ifdef TARGET_I386
2028 kvm_tpr_opt_setup();
2029 #endif
2030 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
2032 signal(SIG_IPI, sig_ipi_handler);
2034 memset(&action, 0, sizeof(action));
2035 action.sa_flags = SA_SIGINFO;
2036 action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
2037 sigaction(SIGBUS, &action, NULL);
2038 prctl(PR_MCE_KILL, 1, 1);
2039 return 0;
2042 void qemu_kvm_notify_work(void)
2044 uint64_t value = 1;
2045 char buffer[8];
2046 size_t offset = 0;
2048 if (io_thread_fd == -1)
2049 return;
2051 memcpy(buffer, &value, sizeof(value));
2053 while (offset < 8) {
2054 ssize_t len;
2056 len = write(io_thread_fd, buffer + offset, 8 - offset);
2057 if (len == -1 && errno == EINTR)
2058 continue;
        /* In case we have a pipe, there is no reason to insist on writing
         * 8 bytes
         */
2063 if (len == -1 && errno == EAGAIN)
2064 break;
2066 if (len <= 0)
2067 break;
2069 offset += len;
/* If we have signalfd, we mask out the signals we want to handle and then
 * use signalfd to listen for them.  We rely on whatever the current signal
 * handler is to dispatch the signals when we receive them.
 */
2078 static void sigfd_handler(void *opaque)
2080 int fd = (unsigned long) opaque;
2081 struct qemu_signalfd_siginfo info;
2082 struct sigaction action;
2083 ssize_t len;
2085 while (1) {
2086 do {
2087 len = read(fd, &info, sizeof(info));
2088 } while (len == -1 && errno == EINTR);
2090 if (len == -1 && errno == EAGAIN)
2091 break;
2093 if (len != sizeof(info)) {
2094 printf("read from sigfd returned %zd: %m\n", len);
2095 return;
2098 sigaction(info.ssi_signo, NULL, &action);
2099 if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction)
2100 action.sa_sigaction(info.ssi_signo,
2101 (siginfo_t *)&info, NULL);
2102 else if (action.sa_handler)
2103 action.sa_handler(info.ssi_signo);
2108 /* Used to break IO thread out of select */
2109 static void io_thread_wakeup(void *opaque)
2111 int fd = (unsigned long) opaque;
2112 char buffer[4096];
    /* Drain the pipe (or eventfd) */
2115 while (1) {
2116 ssize_t len;
2118 len = read(fd, buffer, sizeof(buffer));
2119 if (len == -1 && errno == EINTR)
2120 continue;
2122 if (len <= 0)
2123 break;
2127 int kvm_main_loop(void)
2129 int fds[2];
2130 sigset_t mask;
2131 int sigfd;
2133 io_thread = pthread_self();
2134 qemu_system_ready = 1;
2136 if (qemu_eventfd(fds) == -1) {
2137 fprintf(stderr, "failed to create eventfd\n");
2138 return -errno;
2141 fcntl(fds[0], F_SETFL, O_NONBLOCK);
2142 fcntl(fds[1], F_SETFL, O_NONBLOCK);
2144 qemu_set_fd_handler2(fds[0], NULL, io_thread_wakeup, NULL,
2145 (void *)(unsigned long) fds[0]);
2147 io_thread_fd = fds[1];
2149 sigemptyset(&mask);
2150 sigaddset(&mask, SIGIO);
2151 sigaddset(&mask, SIGALRM);
2152 sigaddset(&mask, SIGBUS);
2153 sigprocmask(SIG_BLOCK, &mask, NULL);
2155 sigfd = qemu_signalfd(&mask);
2156 if (sigfd == -1) {
2157 fprintf(stderr, "failed to create signalfd\n");
2158 return -errno;
2161 fcntl(sigfd, F_SETFL, O_NONBLOCK);
2163 qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
2164 (void *)(unsigned long) sigfd);
2166 pthread_cond_broadcast(&qemu_system_cond);
2168 io_thread_sigfd = sigfd;
2169 cpu_single_env = NULL;
2171 while (1) {
2172 main_loop_wait(1000);
2173 if (qemu_shutdown_requested()) {
2174 if (qemu_no_shutdown()) {
2175 vm_stop(0);
2176 } else
2177 break;
2178 } else if (qemu_powerdown_requested())
2179 qemu_irq_raise(qemu_system_powerdown);
2180 else if (qemu_reset_requested())
2181 qemu_kvm_system_reset();
2182 else if (kvm_debug_cpu_requested) {
2183 gdb_set_stop_cpu(kvm_debug_cpu_requested);
2184 vm_stop(EXCP_DEBUG);
2185 kvm_debug_cpu_requested = NULL;
2189 pause_all_threads();
2190 pthread_mutex_unlock(&qemu_mutex);
2192 return 0;
2195 #ifdef TARGET_I386
2196 static int destroy_region_works = 0;
2197 #endif
2200 #if !defined(TARGET_I386)
2201 int kvm_arch_init_irq_routing(void)
2203 return 0;
2205 #endif
2207 extern int no_hpet;
2209 static int kvm_create_context(void)
2211 int r;
2213 if (!kvm_irqchip) {
2214 kvm_disable_irqchip_creation(kvm_context);
2216 if (!kvm_pit) {
2217 kvm_disable_pit_creation(kvm_context);
2219 if (kvm_create(kvm_context, 0, NULL) < 0) {
2220 kvm_finalize(kvm_state);
2221 return -1;
2223 r = kvm_arch_qemu_create_context();
2224 if (r < 0)
2225 kvm_finalize(kvm_state);
2226 if (kvm_pit && !kvm_pit_reinject) {
2227 if (kvm_reinject_control(kvm_context, 0)) {
2228 fprintf(stderr, "failure to disable in-kernel PIT reinjection\n");
2229 return -1;
2232 #ifdef TARGET_I386
2233 destroy_region_works = kvm_destroy_memory_region_works(kvm_context);
2234 #endif
2236 r = kvm_arch_init_irq_routing();
2237 if (r < 0) {
2238 return r;
2241 kvm_init_ap();
2242 if (kvm_irqchip) {
2243 if (!qemu_kvm_has_gsi_routing()) {
2244 irq0override = 0;
2245 #ifdef TARGET_I386
            /* if the kernel can't do irq routing, interrupt source
             * override 0->2 cannot be set up as required by hpet,
             * so disable hpet.
             */
2250 no_hpet = 1;
2251 } else if (!qemu_kvm_has_pit_state2()) {
2252 no_hpet = 1;
2254 #else
2256 #endif
2259 return 0;
2262 #ifdef TARGET_I386
2263 static int must_use_aliases_source(target_phys_addr_t addr)
2265 if (destroy_region_works)
2266 return false;
2267 if (addr == 0xa0000 || addr == 0xa8000)
2268 return true;
2269 return false;
2272 static int must_use_aliases_target(target_phys_addr_t addr)
2274 if (destroy_region_works)
2275 return false;
2276 if (addr >= 0xe0000000 && addr < 0x100000000ull)
2277 return true;
2278 return false;
2281 static struct mapping {
2282 target_phys_addr_t phys;
2283 ram_addr_t ram;
2284 ram_addr_t len;
2285 } mappings[50];
2286 static int nr_mappings;
2288 static struct mapping *find_ram_mapping(ram_addr_t ram_addr)
2290 struct mapping *p;
2292 for (p = mappings; p < mappings + nr_mappings; ++p) {
2293 if (p->ram <= ram_addr && ram_addr < p->ram + p->len) {
2294 return p;
2297 return NULL;
2300 static struct mapping *find_mapping(target_phys_addr_t start_addr)
2302 struct mapping *p;
2304 for (p = mappings; p < mappings + nr_mappings; ++p) {
2305 if (p->phys <= start_addr && start_addr < p->phys + p->len) {
2306 return p;
2309 return NULL;
2312 static void drop_mapping(target_phys_addr_t start_addr)
2314 struct mapping *p = find_mapping(start_addr);
2316 if (p)
2317 *p = mappings[--nr_mappings];
2319 #endif
2321 void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
2322 ram_addr_t phys_offset)
2324 int r = 0;
2325 unsigned long area_flags;
2326 #ifdef TARGET_I386
2327 struct mapping *p;
2328 #endif
2330 if (start_addr + size > phys_ram_size) {
2331 phys_ram_size = start_addr + size;
2334 phys_offset &= ~IO_MEM_ROM;
2335 area_flags = phys_offset & ~TARGET_PAGE_MASK;
2337 if (area_flags != IO_MEM_RAM) {
2338 #ifdef TARGET_I386
2339 if (must_use_aliases_source(start_addr)) {
2340 kvm_destroy_memory_alias(kvm_context, start_addr);
2341 return;
2343 if (must_use_aliases_target(start_addr))
2344 return;
2345 #endif
2346 while (size > 0) {
2347 p = find_mapping(start_addr);
2348 if (p) {
2349 kvm_unregister_memory_area(kvm_context, p->phys, p->len);
2350 drop_mapping(p->phys);
2352 start_addr += TARGET_PAGE_SIZE;
2353 if (size > TARGET_PAGE_SIZE) {
2354 size -= TARGET_PAGE_SIZE;
2355 } else {
2356 size = 0;
2359 return;
2362 r = kvm_is_containing_region(kvm_context, start_addr, size);
2363 if (r)
2364 return;
2366 if (area_flags >= TLB_MMIO)
2367 return;
2369 #ifdef TARGET_I386
2370 if (must_use_aliases_source(start_addr)) {
2371 p = find_ram_mapping(phys_offset);
2372 if (p) {
2373 kvm_create_memory_alias(kvm_context, start_addr, size,
2374 p->phys + (phys_offset - p->ram));
2376 return;
2378 #endif
2380 r = kvm_register_phys_mem(kvm_context, start_addr,
2381 qemu_get_ram_ptr(phys_offset), size, 0);
2382 if (r < 0) {
2383 printf("kvm_cpu_register_physical_memory: failed\n");
2384 exit(1);
2386 #ifdef TARGET_I386
2387 drop_mapping(start_addr);
2388 p = &mappings[nr_mappings++];
2389 p->phys = start_addr;
2390 p->ram = phys_offset;
2391 p->len = size;
2392 #endif
2394 return;
2397 int kvm_setup_guest_memory(void *area, unsigned long size)
2399 int ret = 0;
2401 #ifdef MADV_DONTFORK
2402 if (kvm_enabled() && !kvm_has_sync_mmu())
2403 ret = madvise(area, size, MADV_DONTFORK);
2404 #endif
2406 if (ret)
2407 perror("madvise");
2409 return ret;
2412 #ifdef KVM_CAP_SET_GUEST_DEBUG
2414 struct kvm_set_guest_debug_data {
2415 struct kvm_guest_debug dbg;
2416 int err;
2419 static void kvm_invoke_set_guest_debug(void *data)
2421 struct kvm_set_guest_debug_data *dbg_data = data;
2423 if (cpu_single_env->kvm_cpu_state.regs_modified) {
2424 kvm_arch_put_registers(cpu_single_env);
2425 cpu_single_env->kvm_cpu_state.regs_modified = 0;
2427 dbg_data->err =
2428 kvm_set_guest_debug(cpu_single_env->kvm_cpu_state.vcpu_ctx,
2429 &dbg_data->dbg);
2432 int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
2434 struct kvm_set_guest_debug_data data;
2436 data.dbg.control = 0;
2437 if (env->singlestep_enabled)
2438 data.dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
2440 kvm_arch_update_guest_debug(env, &data.dbg);
2441 data.dbg.control |= reinject_trap;
2443 on_vcpu(env, kvm_invoke_set_guest_debug, &data);
2444 return data.err;
2447 #endif
/*
 * dirty pages logging
 */
2452 /* FIXME: use unsigned long pointer instead of unsigned char */
2453 unsigned char *kvm_dirty_bitmap = NULL;
2454 int kvm_physical_memory_set_dirty_tracking(int enable)
2456 int r = 0;
2458 if (!kvm_enabled())
2459 return 0;
2461 if (enable) {
2462 if (!kvm_dirty_bitmap) {
2463 unsigned bitmap_size = BITMAP_SIZE(phys_ram_size);
2464 kvm_dirty_bitmap = qemu_malloc(bitmap_size);
2465 r = kvm_dirty_pages_log_enable_all(kvm_context);
2467 } else {
2468 if (kvm_dirty_bitmap) {
2469 r = kvm_dirty_pages_log_reset(kvm_context);
2470 qemu_free(kvm_dirty_bitmap);
2471 kvm_dirty_bitmap = NULL;
2474 return r;
2477 /* get kvm's dirty pages bitmap and update qemu's */
2478 static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
2479 unsigned char *bitmap,
2480 unsigned long offset,
2481 unsigned long mem_size)
2483 unsigned int i, j, n = 0;
2484 unsigned char c;
2485 unsigned long page_number, addr, addr1;
2486 ram_addr_t ram_addr;
2487 unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + 7) / 8;
    /*
     * bitmap-traveling is faster than memory-traveling (for addr...)
     * especially when most of the memory is not dirty.
     */
2493 for (i = 0; i < len; i++) {
2494 c = bitmap[i];
2495 while (c > 0) {
2496 j = ffsl(c) - 1;
2497 c &= ~(1u << j);
2498 page_number = i * 8 + j;
2499 addr1 = page_number * TARGET_PAGE_SIZE;
2500 addr = offset + addr1;
2501 ram_addr = cpu_get_physical_page_desc(addr);
2502 cpu_physical_memory_set_dirty(ram_addr);
2503 n++;
2506 return 0;
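/*
 * Worked example of the walk above: if bitmap[2] == 0x41, ffsl() yields
 * bit 0 and then bit 6, i.e. page numbers 16 and 22, so the guest pages at
 * offset + 16 * TARGET_PAGE_SIZE and offset + 22 * TARGET_PAGE_SIZE are
 * marked dirty.
 */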
2509 static int kvm_get_dirty_bitmap_cb(unsigned long start, unsigned long len,
2510 void *bitmap, void *opaque)
2512 return kvm_get_dirty_pages_log_range(start, bitmap, start, len);
/*
 * get kvm's dirty pages bitmap and update qemu's
 * we only care about physical ram, which resides in slots 0 and 3
 */
2519 int kvm_update_dirty_pages_log(void)
2521 int r = 0;
2524 r = kvm_get_dirty_pages_range(kvm_context, 0, -1UL, NULL,
2525 kvm_get_dirty_bitmap_cb);
2526 return r;
2529 void kvm_qemu_log_memory(target_phys_addr_t start, target_phys_addr_t size,
2530 int log)
2532 if (log)
2533 kvm_dirty_pages_log_enable_slot(kvm_context, start, size);
2534 else {
2535 #ifdef TARGET_I386
2536 if (must_use_aliases_target(start))
2537 return;
2538 #endif
2539 kvm_dirty_pages_log_disable_slot(kvm_context, start, size);
2543 #ifdef KVM_CAP_IRQCHIP
2545 int kvm_set_irq(int irq, int level, int *status)
2547 return kvm_set_irq_level(kvm_context, irq, level, status);
2550 #endif
2552 int qemu_kvm_get_dirty_pages(unsigned long phys_addr, void *buf)
2554 return kvm_get_dirty_pages(kvm_context, phys_addr, buf);
2557 void kvm_mutex_unlock(void)
2559 assert(!cpu_single_env);
2560 pthread_mutex_unlock(&qemu_mutex);
2563 void kvm_mutex_lock(void)
2565 pthread_mutex_lock(&qemu_mutex);
2566 cpu_single_env = NULL;
2569 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2570 void kvm_add_ioperm_data(struct ioperm_data *data)
2572 QLIST_INSERT_HEAD(&ioperm_head, data, entries);
2575 void kvm_remove_ioperm_data(unsigned long start_port, unsigned long num)
2577 struct ioperm_data *data;
2579 data = QLIST_FIRST(&ioperm_head);
2580 while (data) {
2581 struct ioperm_data *next = QLIST_NEXT(data, entries);
2583 if (data->start_port == start_port && data->num == num) {
2584 QLIST_REMOVE(data, entries);
2585 qemu_free(data);
2588 data = next;
2592 void kvm_ioperm(CPUState *env, void *data)
2594 if (kvm_enabled() && qemu_system_ready)
2595 on_vcpu(env, kvm_arch_do_ioperm, data);
2598 #endif
2600 int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
2601 target_phys_addr_t end_addr)
2603 #ifndef TARGET_IA64
2605 #ifdef TARGET_I386
2606 if (must_use_aliases_source(start_addr))
2607 return 0;
2608 #endif
2610 kvm_get_dirty_pages_range(kvm_context, start_addr,
2611 end_addr - start_addr, NULL,
2612 kvm_get_dirty_bitmap_cb);
2613 #endif
2614 return 0;
2617 int kvm_log_start(target_phys_addr_t phys_addr, target_phys_addr_t len)
2619 #ifdef TARGET_I386
2620 if (must_use_aliases_source(phys_addr))
2621 return 0;
2622 #endif
2624 #ifndef TARGET_IA64
2625 kvm_qemu_log_memory(phys_addr, len, 1);
2626 #endif
2627 return 0;
2630 int kvm_log_stop(target_phys_addr_t phys_addr, target_phys_addr_t len)
2632 #ifdef TARGET_I386
2633 if (must_use_aliases_source(phys_addr))
2634 return 0;
2635 #endif
2637 #ifndef TARGET_IA64
2638 kvm_qemu_log_memory(phys_addr, len, 0);
2639 #endif
2640 return 0;
2643 int kvm_set_boot_cpu_id(uint32_t id)
2645 return kvm_set_boot_vcpu_id(kvm_context, id);
2648 #ifdef TARGET_I386
2649 #ifdef KVM_CAP_MCE
2650 struct kvm_x86_mce_data {
2651 CPUState *env;
2652 struct kvm_x86_mce *mce;
2653 int abort_on_error;
2656 static void kvm_do_inject_x86_mce(void *_data)
2658 struct kvm_x86_mce_data *data = _data;
2659 int r;
2661 r = kvm_set_mce(data->env->kvm_cpu_state.vcpu_ctx, data->mce);
2662 if (r < 0) {
2663 perror("kvm_set_mce FAILED");
2664 if (data->abort_on_error)
2665 abort();
2668 #endif
2670 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
2671 uint64_t mcg_status, uint64_t addr, uint64_t misc,
2672 int abort_on_error)
2674 #ifdef KVM_CAP_MCE
2675 struct kvm_x86_mce mce = {
2676 .bank = bank,
2677 .status = status,
2678 .mcg_status = mcg_status,
2679 .addr = addr,
2680 .misc = misc,
2682 struct kvm_x86_mce_data data = {
2683 .env = cenv,
2684 .mce = &mce,
2685 .abort_on_error = abort_on_error,
2688 if (!cenv->mcg_cap) {
2689 fprintf(stderr, "MCE support is not enabled!\n");
2690 return;
2692 on_vcpu(cenv, kvm_do_inject_x86_mce, &data);
2693 #else
2694 if (abort_on_error)
2695 abort();
2696 #endif
2698 #endif