[qemu-kvm/amd-iommu.git] / qemu-kvm.c
commit: use a more upstream friendly version of irqchip-in-kernel test
blob d53e9758ade5b6cf9af3bba1d8a01ed37f0cc852
1 /*
2 * qemu/kvm integration
4 * Copyright (C) 2006-2008 Qumranet Technologies
6 * Licensed under the terms of the GNU GPL version 2 or higher.
7 */
8 #include "config.h"
9 #include "config-host.h"
11 #include <assert.h>
12 #include <string.h>
13 #include "hw/hw.h"
14 #include "sysemu.h"
15 #include "qemu-common.h"
16 #include "console.h"
17 #include "block.h"
18 #include "compatfd.h"
19 #include "gdbstub.h"
21 #include "qemu-kvm.h"
22 #include "libkvm.h"
24 #include <pthread.h>
25 #include <sys/utsname.h>
26 #include <sys/syscall.h>
27 #include <sys/mman.h>
28 #include <sys/ioctl.h>
29 #include "compatfd.h"
30 #include <sys/prctl.h>
32 #define false 0
33 #define true 1
35 #ifndef PR_MCE_KILL
36 #define PR_MCE_KILL 33
37 #endif
39 #ifndef BUS_MCEERR_AR
40 #define BUS_MCEERR_AR 4
41 #endif
42 #ifndef BUS_MCEERR_AO
43 #define BUS_MCEERR_AO 5
44 #endif
46 #define EXPECTED_KVM_API_VERSION 12
48 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
49 #error libkvm: userspace and kernel version mismatch
50 #endif
52 int kvm_allowed = 1;
53 int kvm_irqchip = 1;
54 int kvm_pit = 1;
55 int kvm_pit_reinject = 1;
56 int kvm_nested = 0;
59 KVMState *kvm_state;
60 kvm_context_t kvm_context;
62 pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
63 pthread_cond_t qemu_vcpu_cond = PTHREAD_COND_INITIALIZER;
64 pthread_cond_t qemu_system_cond = PTHREAD_COND_INITIALIZER;
65 pthread_cond_t qemu_pause_cond = PTHREAD_COND_INITIALIZER;
66 pthread_cond_t qemu_work_cond = PTHREAD_COND_INITIALIZER;
67 __thread CPUState *current_env;
69 static int qemu_system_ready;
71 #define SIG_IPI (SIGRTMIN+4)
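/* SIG_IPI is sent to vcpu threads (see on_vcpu() and pause_all_threads()
 * below) to interrupt KVM_RUN and force a trip through the main loop. */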
73 pthread_t io_thread;
74 static int io_thread_fd = -1;
75 static int io_thread_sigfd = -1;
77 static CPUState *kvm_debug_cpu_requested;
79 static uint64_t phys_ram_size;
81 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
82 /* The list of ioperm_data */
83 static QLIST_HEAD(, ioperm_data) ioperm_head;
84 #endif
86 //#define DEBUG_MEMREG
87 #ifdef DEBUG_MEMREG
88 #define DPRINTF(fmt, args...) \
89 do { fprintf(stderr, "%s:%d " fmt , __func__, __LINE__, ##args); } while (0)
90 #else
91 #define DPRINTF(fmt, args...) do {} while (0)
92 #endif
94 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
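/* ALIGN() rounds x up to the next multiple of y; y must be a power of two. */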
96 int kvm_abi = EXPECTED_KVM_API_VERSION;
97 int kvm_page_size;
99 #ifdef KVM_CAP_SET_GUEST_DEBUG
100 static int kvm_debug(void *opaque, void *data,
101 struct kvm_debug_exit_arch *arch_info)
103 int handle = kvm_arch_debug(arch_info);
104 CPUState *env = data;
106 if (handle) {
107 kvm_debug_cpu_requested = env;
108 env->stopped = 1;
110 return handle;
112 #endif
114 int kvm_mmio_read(void *opaque, uint64_t addr, uint8_t *data, int len)
116 cpu_physical_memory_rw(addr, data, len, 0);
117 return 0;
120 int kvm_mmio_write(void *opaque, uint64_t addr, uint8_t *data, int len)
122 cpu_physical_memory_rw(addr, data, len, 1);
123 return 0;
126 static int handle_unhandled(uint64_t reason)
128 fprintf(stderr, "kvm: unhandled exit %" PRIx64 "\n", reason);
129 return -EINVAL;
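/* used_gsi_bitmap tracks which GSI numbers are already routed: one bit per
 * GSI, 32 GSIs per word.  set_gsi()/clear_gsi() maintain the bitmap and
 * kvm_get_irq_route_gsi() below searches it for a free number. */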
133 static inline void set_gsi(kvm_context_t kvm, unsigned int gsi)
135 uint32_t *bitmap = kvm->used_gsi_bitmap;
137 if (gsi < kvm->max_gsi)
138 bitmap[gsi / 32] |= 1U << (gsi % 32);
139 else
140 DPRINTF("Invalid GSI %d\n");
143 static inline void clear_gsi(kvm_context_t kvm, unsigned int gsi)
145 uint32_t *bitmap = kvm->used_gsi_bitmap;
147 if (gsi < kvm->max_gsi)
148 bitmap[gsi / 32] &= ~(1U << (gsi % 32));
149 else
150 DPRINTF("Invalid GSI %d\n");
153 struct slot_info {
154 unsigned long phys_addr;
155 unsigned long len;
156 unsigned long userspace_addr;
157 unsigned flags;
158 int logging_count;
161 struct slot_info slots[KVM_MAX_NUM_MEM_REGIONS];
163 static void init_slots(void)
165 int i;
167 for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i)
168 slots[i].len = 0;
171 static int get_free_slot(kvm_context_t kvm)
173 int i;
174 int tss_ext;
176 #if defined(KVM_CAP_SET_TSS_ADDR) && !defined(__s390__)
177 tss_ext = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_SET_TSS_ADDR);
178 #else
179 tss_ext = 0;
180 #endif
 183      * on older kernels where the set tss ioctl is not supported we must save
184 * slot 0 to hold the extended memory, as the vmx will use the last 3
185 * pages of this slot.
187 if (tss_ext > 0)
188 i = 0;
189 else
190 i = 1;
192 for (; i < KVM_MAX_NUM_MEM_REGIONS; ++i)
193 if (!slots[i].len)
194 return i;
195 return -1;
198 static void register_slot(int slot, unsigned long phys_addr,
199 unsigned long len, unsigned long userspace_addr,
200 unsigned flags)
202 slots[slot].phys_addr = phys_addr;
203 slots[slot].len = len;
204 slots[slot].userspace_addr = userspace_addr;
205 slots[slot].flags = flags;
208 static void free_slot(int slot)
210 slots[slot].len = 0;
211 slots[slot].logging_count = 0;
214 static int get_slot(unsigned long phys_addr)
216 int i;
218 for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) {
219 if (slots[i].len && slots[i].phys_addr <= phys_addr &&
220 (slots[i].phys_addr + slots[i].len - 1) >= phys_addr)
221 return i;
223 return -1;
 226 /* Returns -1 if this slot is not totally contained in any other,
227 * and the number of the slot otherwise */
228 static int get_container_slot(uint64_t phys_addr, unsigned long size)
230 int i;
232 for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i)
233 if (slots[i].len && slots[i].phys_addr <= phys_addr &&
234 (slots[i].phys_addr + slots[i].len) >= phys_addr + size)
235 return i;
236 return -1;
239 int kvm_is_containing_region(kvm_context_t kvm, unsigned long phys_addr,
240 unsigned long size)
242 int slot = get_container_slot(phys_addr, size);
243 if (slot == -1)
244 return 0;
245 return 1;
249 * dirty pages logging control
251 static int kvm_dirty_pages_log_change(kvm_context_t kvm,
252 unsigned long phys_addr, unsigned flags,
253 unsigned mask)
255 int r = -1;
256 int slot = get_slot(phys_addr);
258 if (slot == -1) {
259 fprintf(stderr, "BUG: %s: invalid parameters\n", __FUNCTION__);
260 return 1;
263 flags = (slots[slot].flags & ~mask) | flags;
264 if (flags == slots[slot].flags)
265 return 0;
266 slots[slot].flags = flags;
269 struct kvm_userspace_memory_region mem = {
270 .slot = slot,
271 .memory_size = slots[slot].len,
272 .guest_phys_addr = slots[slot].phys_addr,
273 .userspace_addr = slots[slot].userspace_addr,
274 .flags = slots[slot].flags,
278 DPRINTF("slot %d start %llx len %llx flags %x\n",
279 mem.slot, mem.guest_phys_addr, mem.memory_size, mem.flags);
280 r = kvm_vm_ioctl(kvm_state, KVM_SET_USER_MEMORY_REGION, &mem);
281 if (r < 0)
282 fprintf(stderr, "%s: %m\n", __FUNCTION__);
284 return r;
287 static int kvm_dirty_pages_log_change_all(kvm_context_t kvm,
288 int (*change)(kvm_context_t kvm,
289 uint64_t start,
290 uint64_t len))
292 int i, r;
294 for (i = r = 0; i < KVM_MAX_NUM_MEM_REGIONS && r == 0; i++) {
295 if (slots[i].len)
296 r = change(kvm, slots[i].phys_addr, slots[i].len);
298 return r;
301 int kvm_dirty_pages_log_enable_slot(kvm_context_t kvm, uint64_t phys_addr,
302 uint64_t len)
304 int slot = get_slot(phys_addr);
306 DPRINTF("start %" PRIx64 " len %" PRIx64 "\n", phys_addr, len);
307 if (slot == -1) {
308 fprintf(stderr, "BUG: %s: invalid parameters\n", __func__);
309 return -EINVAL;
312 if (slots[slot].logging_count++)
313 return 0;
315 return kvm_dirty_pages_log_change(kvm, slots[slot].phys_addr,
316 KVM_MEM_LOG_DIRTY_PAGES,
317 KVM_MEM_LOG_DIRTY_PAGES);
320 int kvm_dirty_pages_log_disable_slot(kvm_context_t kvm, uint64_t phys_addr,
321 uint64_t len)
323 int slot = get_slot(phys_addr);
325 if (slot == -1) {
326 fprintf(stderr, "BUG: %s: invalid parameters\n", __func__);
327 return -EINVAL;
330 if (--slots[slot].logging_count)
331 return 0;
333 return kvm_dirty_pages_log_change(kvm, slots[slot].phys_addr, 0,
334 KVM_MEM_LOG_DIRTY_PAGES);
338 * Enable dirty page logging for all memory regions
340 int kvm_dirty_pages_log_enable_all(kvm_context_t kvm)
342 if (kvm->dirty_pages_log_all)
343 return 0;
344 kvm->dirty_pages_log_all = 1;
345 return kvm_dirty_pages_log_change_all(kvm, kvm_dirty_pages_log_enable_slot);
349 * Enable dirty page logging only for memory regions that were created with
350 * dirty logging enabled (disable for all other memory regions).
352 int kvm_dirty_pages_log_reset(kvm_context_t kvm)
354 if (!kvm->dirty_pages_log_all)
355 return 0;
356 kvm->dirty_pages_log_all = 0;
357 return kvm_dirty_pages_log_change_all(kvm,
358 kvm_dirty_pages_log_disable_slot);
362 static int kvm_create_context(void);
364 int kvm_init(int smp_cpus)
366 int fd;
367 int r, gsi_count;
370 fd = open("/dev/kvm", O_RDWR);
371 if (fd == -1) {
372 perror("open /dev/kvm");
373 return -1;
375 r = ioctl(fd, KVM_GET_API_VERSION, 0);
376 if (r == -1) {
377 fprintf(stderr,
378 "kvm kernel version too old: "
379 "KVM_GET_API_VERSION ioctl not supported\n");
380 goto out_close;
382 if (r < EXPECTED_KVM_API_VERSION) {
383 fprintf(stderr, "kvm kernel version too old: "
384 "We expect API version %d or newer, but got "
385 "version %d\n", EXPECTED_KVM_API_VERSION, r);
386 goto out_close;
388 if (r > EXPECTED_KVM_API_VERSION) {
389 fprintf(stderr, "kvm userspace version too old\n");
390 goto out_close;
392 kvm_abi = r;
393 kvm_page_size = getpagesize();
394 kvm_state = qemu_mallocz(sizeof(*kvm_state));
395 kvm_context = &kvm_state->kvm_context;
397 kvm_state->fd = fd;
398 kvm_state->vmfd = -1;
399 kvm_context->opaque = cpu_single_env;
400 kvm_context->dirty_pages_log_all = 0;
401 kvm_context->no_irqchip_creation = 0;
402 kvm_context->no_pit_creation = 0;
404 #ifdef KVM_CAP_SET_GUEST_DEBUG
405 QTAILQ_INIT(&kvm_state->kvm_sw_breakpoints);
406 #endif
408 gsi_count = kvm_get_gsi_count(kvm_context);
409 if (gsi_count > 0) {
410 int gsi_bits, i;
412 /* Round up so we can search ints using ffs */
413 gsi_bits = ALIGN(gsi_count, 32);
414 kvm_context->used_gsi_bitmap = qemu_mallocz(gsi_bits / 8);
415 kvm_context->max_gsi = gsi_bits;
417 /* Mark any over-allocated bits as already in use */
418 for (i = gsi_count; i < gsi_bits; i++)
419 set_gsi(kvm_context, i);
422 pthread_mutex_lock(&qemu_mutex);
423 return kvm_create_context();
425 out_close:
426 close(fd);
427 return -1;
430 static void kvm_finalize(KVMState *s)
432 /* FIXME
433 if (kvm->vcpu_fd[0] != -1)
434 close(kvm->vcpu_fd[0]);
435 if (kvm->vm_fd != -1)
436 close(kvm->vm_fd);
438 close(s->fd);
439 free(s);
442 void kvm_disable_irqchip_creation(kvm_context_t kvm)
444 kvm->no_irqchip_creation = 1;
447 void kvm_disable_pit_creation(kvm_context_t kvm)
449 kvm->no_pit_creation = 1;
452 kvm_vcpu_context_t kvm_create_vcpu(CPUState *env, int id)
454 long mmap_size;
455 int r;
456 kvm_vcpu_context_t vcpu_ctx = qemu_malloc(sizeof(struct kvm_vcpu_context));
457 kvm_context_t kvm = kvm_context;
459 vcpu_ctx->kvm = kvm;
460 vcpu_ctx->id = id;
462 r = kvm_vm_ioctl(kvm_state, KVM_CREATE_VCPU, id);
463 if (r < 0) {
464 fprintf(stderr, "kvm_create_vcpu: %m\n");
465 goto err;
467 vcpu_ctx->fd = r;
469 env->kvm_fd = r;
470 env->kvm_state = kvm_state;
472 mmap_size = kvm_ioctl(kvm_state, KVM_GET_VCPU_MMAP_SIZE, 0);
473 if (mmap_size < 0) {
474 fprintf(stderr, "get vcpu mmap size: %m\n");
475 goto err_fd;
477 vcpu_ctx->run =
478 mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu_ctx->fd,
480 if (vcpu_ctx->run == MAP_FAILED) {
481 fprintf(stderr, "mmap vcpu area: %m\n");
482 goto err_fd;
484 return vcpu_ctx;
485 err_fd:
486 close(vcpu_ctx->fd);
487 err:
488 free(vcpu_ctx);
489 return NULL;
492 static int kvm_set_boot_vcpu_id(kvm_context_t kvm, uint32_t id)
494 #ifdef KVM_CAP_SET_BOOT_CPU_ID
495 int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_SET_BOOT_CPU_ID);
496 if (r > 0)
497 return kvm_vm_ioctl(kvm_state, KVM_SET_BOOT_CPU_ID, id);
498 return -ENOSYS;
499 #else
500 return -ENOSYS;
501 #endif
504 int kvm_create_vm(kvm_context_t kvm)
506 int fd;
507 #ifdef KVM_CAP_IRQ_ROUTING
508 kvm->irq_routes = qemu_mallocz(sizeof(*kvm->irq_routes));
509 kvm->nr_allocated_irq_routes = 0;
510 #endif
512 fd = kvm_ioctl(kvm_state, KVM_CREATE_VM, 0);
513 if (fd < 0) {
514 fprintf(stderr, "kvm_create_vm: %m\n");
515 return -1;
517 kvm_state->vmfd = fd;
518 return 0;
521 static int kvm_create_default_phys_mem(kvm_context_t kvm,
522 unsigned long phys_mem_bytes,
523 void **vm_mem)
525 #ifdef KVM_CAP_USER_MEMORY
526 int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY);
527 if (r > 0)
528 return 0;
529 fprintf(stderr,
530 "Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported\n");
531 #else
532 #error Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported
533 #endif
534 return -1;
537 void kvm_create_irqchip(kvm_context_t kvm)
539 int r;
541 kvm->irqchip_in_kernel = 0;
542 #ifdef KVM_CAP_IRQCHIP
543 if (!kvm->no_irqchip_creation) {
544 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP);
545 if (r > 0) { /* kernel irqchip supported */
546 r = kvm_vm_ioctl(kvm_state, KVM_CREATE_IRQCHIP);
547 if (r >= 0) {
548 kvm->irqchip_inject_ioctl = KVM_IRQ_LINE;
549 #if defined(KVM_CAP_IRQ_INJECT_STATUS) && defined(KVM_IRQ_LINE_STATUS)
550 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION,
551 KVM_CAP_IRQ_INJECT_STATUS);
552 if (r > 0)
553 kvm->irqchip_inject_ioctl = KVM_IRQ_LINE_STATUS;
554 #endif
555 kvm->irqchip_in_kernel = 1;
556 } else
557 fprintf(stderr, "Create kernel PIC irqchip failed\n");
560 #endif
561 kvm_state->irqchip_in_kernel = kvm->irqchip_in_kernel;
564 int kvm_create(kvm_context_t kvm, unsigned long phys_mem_bytes, void **vm_mem)
566 int r;
568 r = kvm_create_vm(kvm);
569 if (r < 0)
570 return r;
571 r = kvm_arch_create(kvm, phys_mem_bytes, vm_mem);
572 if (r < 0)
573 return r;
574 init_slots();
575 r = kvm_create_default_phys_mem(kvm, phys_mem_bytes, vm_mem);
576 if (r < 0)
577 return r;
578 kvm_create_irqchip(kvm);
580 return 0;
584 int kvm_register_phys_mem(kvm_context_t kvm,
585 unsigned long phys_start, void *userspace_addr,
586 unsigned long len, int log)
589 struct kvm_userspace_memory_region memory = {
590 .memory_size = len,
591 .guest_phys_addr = phys_start,
592 .userspace_addr = (unsigned long) (uintptr_t) userspace_addr,
593 .flags = log ? KVM_MEM_LOG_DIRTY_PAGES : 0,
595 int r;
597 memory.slot = get_free_slot(kvm);
598 DPRINTF
599 ("memory: gpa: %llx, size: %llx, uaddr: %llx, slot: %x, flags: %lx\n",
600 memory.guest_phys_addr, memory.memory_size, memory.userspace_addr,
601 memory.slot, memory.flags);
602 r = kvm_vm_ioctl(kvm_state, KVM_SET_USER_MEMORY_REGION, &memory);
603 if (r < 0) {
604 fprintf(stderr, "create_userspace_phys_mem: %s\n", strerror(-r));
605 return -1;
607 register_slot(memory.slot, memory.guest_phys_addr, memory.memory_size,
608 memory.userspace_addr, memory.flags);
609 return 0;
613 /* destroy/free a whole slot.
614 * phys_start, len and slot are the params passed to kvm_create_phys_mem()
616 void kvm_destroy_phys_mem(kvm_context_t kvm, unsigned long phys_start,
617 unsigned long len)
619 int slot;
620 int r;
621 struct kvm_userspace_memory_region memory = {
622 .memory_size = 0,
623 .guest_phys_addr = phys_start,
624 .userspace_addr = 0,
625 .flags = 0,
628 slot = get_slot(phys_start);
630 if ((slot >= KVM_MAX_NUM_MEM_REGIONS) || (slot == -1)) {
631 fprintf(stderr, "BUG: %s: invalid parameters (slot=%d)\n", __FUNCTION__,
632 slot);
633 return;
635 if (phys_start != slots[slot].phys_addr) {
636 fprintf(stderr,
637 "WARNING: %s: phys_start is 0x%lx expecting 0x%lx\n",
638 __FUNCTION__, phys_start, slots[slot].phys_addr);
639 phys_start = slots[slot].phys_addr;
642 memory.slot = slot;
643 DPRINTF("slot %d start %llx len %llx flags %x\n",
644 memory.slot, memory.guest_phys_addr, memory.memory_size,
645 memory.flags);
646 r = kvm_vm_ioctl(kvm_state, KVM_SET_USER_MEMORY_REGION, &memory);
647 if (r < 0) {
648 fprintf(stderr, "destroy_userspace_phys_mem: %s", strerror(-r));
649 return;
652 free_slot(memory.slot);
655 void kvm_unregister_memory_area(kvm_context_t kvm, uint64_t phys_addr,
656 unsigned long size)
659 int slot = get_container_slot(phys_addr, size);
661 if (slot != -1) {
662 DPRINTF("Unregistering memory region %llx (%lx)\n", phys_addr, size);
663 kvm_destroy_phys_mem(kvm, phys_addr, size);
664 return;
668 static int kvm_get_map(kvm_context_t kvm, int ioctl_num, int slot, void *buf)
670 int r;
671 struct kvm_dirty_log log = {
672 .slot = slot,
675 log.dirty_bitmap = buf;
677 r = kvm_vm_ioctl(kvm_state, ioctl_num, &log);
678 if (r < 0)
679 return r;
680 return 0;
683 int kvm_get_dirty_pages(kvm_context_t kvm, unsigned long phys_addr, void *buf)
685 int slot;
687 slot = get_slot(phys_addr);
688 return kvm_get_map(kvm, KVM_GET_DIRTY_LOG, slot, buf);
691 int kvm_get_dirty_pages_range(kvm_context_t kvm, unsigned long phys_addr,
692 unsigned long len, void *opaque,
693 int (*cb)(unsigned long start,
694 unsigned long len, void *bitmap,
695 void *opaque))
697 int i;
698 int r;
699 unsigned long end_addr = phys_addr + len;
700 void *buf;
702 for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) {
703 if ((slots[i].len && (uint64_t) slots[i].phys_addr >= phys_addr)
704 && ((uint64_t) slots[i].phys_addr + slots[i].len <= end_addr)) {
705 buf = qemu_malloc(BITMAP_SIZE(slots[i].len));
706 r = kvm_get_map(kvm, KVM_GET_DIRTY_LOG, i, buf);
707 if (r) {
708 qemu_free(buf);
709 return r;
711 r = cb(slots[i].phys_addr, slots[i].len, buf, opaque);
712 qemu_free(buf);
713 if (r)
714 return r;
717 return 0;
720 #ifdef KVM_CAP_IRQCHIP
722 int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int *status)
724 struct kvm_irq_level event;
725 int r;
727 if (!kvm->irqchip_in_kernel)
728 return 0;
729 event.level = level;
730 event.irq = irq;
731 r = kvm_vm_ioctl(kvm_state, kvm->irqchip_inject_ioctl, &event);
732 if (r < 0)
733 perror("kvm_set_irq_level");
735 if (status) {
736 #ifdef KVM_CAP_IRQ_INJECT_STATUS
737 *status =
738 (kvm->irqchip_inject_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
739 #else
740 *status = 1;
741 #endif
744 return 1;
747 int kvm_get_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
749 int r;
751 if (!kvm->irqchip_in_kernel)
752 return 0;
753 r = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip);
754 if (r < 0) {
755 perror("kvm_get_irqchip\n");
757 return r;
760 int kvm_set_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
762 int r;
764 if (!kvm->irqchip_in_kernel)
765 return 0;
766 r = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip);
767 if (r < 0) {
768 perror("kvm_set_irqchip\n");
770 return r;
773 #endif
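/* handle_io() services KVM_EXIT_IO: the exit payload lives inside the shared
 * kvm_run page at run->io.data_offset, and run->io.count can exceed 1 for
 * string (INS/OUTS) instructions, hence the loop advancing p by run->io.size. */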
775 static int handle_io(kvm_vcpu_context_t vcpu)
777 struct kvm_run *run = vcpu->run;
778 uint16_t addr = run->io.port;
779 int i;
780 void *p = (void *) run + run->io.data_offset;
782 for (i = 0; i < run->io.count; ++i) {
783 switch (run->io.direction) {
784 case KVM_EXIT_IO_IN:
785 switch (run->io.size) {
786 case 1:
787 *(uint8_t *) p = cpu_inb(addr);
788 break;
789 case 2:
790 *(uint16_t *) p = cpu_inw(addr);
791 break;
792 case 4:
793 *(uint32_t *) p = cpu_inl(addr);
794 break;
795 default:
796 fprintf(stderr, "bad I/O size %d\n", run->io.size);
797 return -EMSGSIZE;
799 break;
800 case KVM_EXIT_IO_OUT:
801 switch (run->io.size) {
802 case 1:
803 cpu_outb(addr, *(uint8_t *) p);
804 break;
805 case 2:
806 cpu_outw(addr, *(uint16_t *) p);
807 break;
808 case 4:
809 cpu_outl(addr, *(uint32_t *) p);
810 break;
811 default:
812 fprintf(stderr, "bad I/O size %d\n", run->io.size);
813 return -EMSGSIZE;
815 break;
816 default:
817 fprintf(stderr, "bad I/O direction %d\n", run->io.direction);
818 return -EPROTO;
821 p += run->io.size;
824 return 0;
827 int handle_debug(kvm_vcpu_context_t vcpu, void *env)
829 #ifdef KVM_CAP_SET_GUEST_DEBUG
830 struct kvm_run *run = vcpu->run;
831 kvm_context_t kvm = vcpu->kvm;
833 return kvm_debug(kvm->opaque, env, &run->debug.arch);
834 #else
835 return 0;
836 #endif
839 int kvm_get_regs(kvm_vcpu_context_t vcpu, struct kvm_regs *regs)
841 return ioctl(vcpu->fd, KVM_GET_REGS, regs);
844 int kvm_set_regs(kvm_vcpu_context_t vcpu, struct kvm_regs *regs)
846 return ioctl(vcpu->fd, KVM_SET_REGS, regs);
849 int kvm_get_fpu(kvm_vcpu_context_t vcpu, struct kvm_fpu *fpu)
851 return ioctl(vcpu->fd, KVM_GET_FPU, fpu);
854 int kvm_set_fpu(kvm_vcpu_context_t vcpu, struct kvm_fpu *fpu)
856 return ioctl(vcpu->fd, KVM_SET_FPU, fpu);
859 int kvm_get_sregs(kvm_vcpu_context_t vcpu, struct kvm_sregs *sregs)
861 return ioctl(vcpu->fd, KVM_GET_SREGS, sregs);
864 int kvm_set_sregs(kvm_vcpu_context_t vcpu, struct kvm_sregs *sregs)
866 return ioctl(vcpu->fd, KVM_SET_SREGS, sregs);
869 #ifdef KVM_CAP_MP_STATE
870 int kvm_get_mpstate(kvm_vcpu_context_t vcpu, struct kvm_mp_state *mp_state)
872 int r;
874 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE);
875 if (r > 0)
876 return ioctl(vcpu->fd, KVM_GET_MP_STATE, mp_state);
877 return -ENOSYS;
880 int kvm_set_mpstate(kvm_vcpu_context_t vcpu, struct kvm_mp_state *mp_state)
882 int r;
884 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE);
885 if (r > 0)
886 return ioctl(vcpu->fd, KVM_SET_MP_STATE, mp_state);
887 return -ENOSYS;
889 #endif
891 static int handle_mmio(kvm_vcpu_context_t vcpu)
893 unsigned long addr = vcpu->run->mmio.phys_addr;
894 kvm_context_t kvm = vcpu->kvm;
895 struct kvm_run *kvm_run = vcpu->run;
896 void *data = kvm_run->mmio.data;
898 /* hack: Red Hat 7.1 generates these weird accesses. */
899 if ((addr > 0xa0000 - 4 && addr <= 0xa0000) && kvm_run->mmio.len == 3)
900 return 0;
902 if (kvm_run->mmio.is_write)
903 return kvm_mmio_write(kvm->opaque, addr, data, kvm_run->mmio.len);
904 else
905 return kvm_mmio_read(kvm->opaque, addr, data, kvm_run->mmio.len);
908 int handle_io_window(kvm_context_t kvm)
910 return 1;
913 int handle_halt(kvm_vcpu_context_t vcpu)
915 return kvm_arch_halt(vcpu->kvm->opaque, vcpu);
918 int handle_shutdown(kvm_context_t kvm, CPUState *env)
920 /* stop the current vcpu from going back to guest mode */
921 env->stopped = 1;
923 qemu_system_reset_request();
924 return 1;
927 static inline void push_nmi(kvm_context_t kvm)
929 #ifdef KVM_CAP_USER_NMI
930 kvm_arch_push_nmi(kvm->opaque);
931 #endif /* KVM_CAP_USER_NMI */
934 void post_kvm_run(kvm_context_t kvm, CPUState *env)
936 pthread_mutex_lock(&qemu_mutex);
937 kvm_arch_post_kvm_run(kvm->opaque, env);
940 int pre_kvm_run(kvm_context_t kvm, CPUState *env)
942 kvm_arch_pre_kvm_run(kvm->opaque, env);
944 pthread_mutex_unlock(&qemu_mutex);
945 return 0;
948 int kvm_get_interrupt_flag(kvm_vcpu_context_t vcpu)
950 return vcpu->run->if_flag;
953 int kvm_is_ready_for_interrupt_injection(kvm_vcpu_context_t vcpu)
955 return vcpu->run->ready_for_interrupt_injection;
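/* kvm_run() drives one vcpu: push any pending NMI, request an interrupt
 * window when the irqchip is in user space, enter the guest with KVM_RUN,
 * drain the coalesced-MMIO ring, then dispatch on run->exit_reason.  A zero
 * result from a handler loops straight back into the guest. */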
958 int kvm_run(kvm_vcpu_context_t vcpu, void *env)
960 int r;
961 int fd = vcpu->fd;
962 struct kvm_run *run = vcpu->run;
963 kvm_context_t kvm = vcpu->kvm;
964 CPUState *_env = env;
966 again:
967 push_nmi(kvm);
968 #if !defined(__s390__)
969 if (!kvm->irqchip_in_kernel)
970 run->request_interrupt_window = kvm_arch_try_push_interrupts(env);
971 #endif
973 if (_env->kvm_cpu_state.regs_modified) {
974 kvm_arch_put_registers(_env);
975 _env->kvm_cpu_state.regs_modified = 0;
978 r = pre_kvm_run(kvm, env);
979 if (r)
980 return r;
981 r = ioctl(fd, KVM_RUN, 0);
983 if (r == -1 && errno != EINTR && errno != EAGAIN) {
984 r = -errno;
985 post_kvm_run(kvm, env);
986 fprintf(stderr, "kvm_run: %s\n", strerror(-r));
987 return r;
990 post_kvm_run(kvm, env);
992 #if defined(KVM_CAP_COALESCED_MMIO)
993 if (kvm_state->coalesced_mmio) {
994 struct kvm_coalesced_mmio_ring *ring =
995 (void *) run + kvm_state->coalesced_mmio * PAGE_SIZE;
996 while (ring->first != ring->last) {
997 kvm_mmio_write(kvm->opaque,
998 ring->coalesced_mmio[ring->first].phys_addr,
999 &ring->coalesced_mmio[ring->first].data[0],
1000 ring->coalesced_mmio[ring->first].len);
1001 smp_wmb();
1002 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
1005 #endif
1007 #if !defined(__s390__)
1008 if (r == -1) {
1009 r = handle_io_window(kvm);
1010 goto more;
1012 #endif
1013 if (1) {
1014 switch (run->exit_reason) {
1015 case KVM_EXIT_UNKNOWN:
1016 r = handle_unhandled(run->hw.hardware_exit_reason);
1017 break;
1018 case KVM_EXIT_FAIL_ENTRY:
1019 r = handle_unhandled(run->fail_entry.hardware_entry_failure_reason);
1020 break;
1021 case KVM_EXIT_EXCEPTION:
1022 fprintf(stderr, "exception %d (%x)\n", run->ex.exception,
1023 run->ex.error_code);
1024 kvm_show_regs(vcpu);
1025 kvm_show_code(vcpu);
1026 abort();
1027 break;
1028 case KVM_EXIT_IO:
1029 r = handle_io(vcpu);
1030 break;
1031 case KVM_EXIT_DEBUG:
1032 r = handle_debug(vcpu, env);
1033 break;
1034 case KVM_EXIT_MMIO:
1035 r = handle_mmio(vcpu);
1036 break;
1037 case KVM_EXIT_HLT:
1038 r = handle_halt(vcpu);
1039 break;
1040 case KVM_EXIT_IRQ_WINDOW_OPEN:
1041 break;
1042 case KVM_EXIT_SHUTDOWN:
1043 r = handle_shutdown(kvm, env);
1044 break;
1045 #if defined(__s390__)
1046 case KVM_EXIT_S390_SIEIC:
1047 r = kvm_s390_handle_intercept(kvm, vcpu, run);
1048 break;
1049 case KVM_EXIT_S390_RESET:
1050 r = kvm_s390_handle_reset(kvm, vcpu, run);
1051 break;
1052 #endif
1053 case KVM_EXIT_INTERNAL_ERROR:
1054 fprintf(stderr, "KVM internal error. Suberror: %d\n",
1055 run->internal.suberror);
1056 kvm_show_regs(vcpu);
1057 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION)
1058 fprintf(stderr, "emulation failure, check dmesg for details\n");
1059 abort();
1060 break;
1061 default:
1062 if (kvm_arch_run(vcpu)) {
1063 fprintf(stderr, "unhandled vm exit: 0x%x\n", run->exit_reason);
1064 kvm_show_regs(vcpu);
1065 abort();
1067 break;
1070 more:
1071 if (!r)
1072 goto again;
1073 return r;
1076 int kvm_inject_irq(kvm_vcpu_context_t vcpu, unsigned irq)
1078 struct kvm_interrupt intr;
1080 intr.irq = irq;
1081 return ioctl(vcpu->fd, KVM_INTERRUPT, &intr);
1084 #ifdef KVM_CAP_SET_GUEST_DEBUG
1085 int kvm_set_guest_debug(kvm_vcpu_context_t vcpu, struct kvm_guest_debug *dbg)
1087 return ioctl(vcpu->fd, KVM_SET_GUEST_DEBUG, dbg);
1089 #endif
1091 int kvm_set_signal_mask(kvm_vcpu_context_t vcpu, const sigset_t *sigset)
1093 struct kvm_signal_mask *sigmask;
1094 int r;
1096 if (!sigset) {
1097 r = ioctl(vcpu->fd, KVM_SET_SIGNAL_MASK, NULL);
1098 if (r == -1)
1099 r = -errno;
1100 return r;
1102 sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));
1104 sigmask->len = 8;
1105 memcpy(sigmask->sigset, sigset, sizeof(*sigset));
1106 r = ioctl(vcpu->fd, KVM_SET_SIGNAL_MASK, sigmask);
1107 if (r == -1)
1108 r = -errno;
1109 free(sigmask);
1110 return r;
1113 int kvm_pit_in_kernel(kvm_context_t kvm)
1115 return kvm->pit_in_kernel;
1118 int kvm_inject_nmi(kvm_vcpu_context_t vcpu)
1120 #ifdef KVM_CAP_USER_NMI
1121 return ioctl(vcpu->fd, KVM_NMI);
1122 #else
1123 return -ENOSYS;
1124 #endif
1127 int kvm_init_coalesced_mmio(kvm_context_t kvm)
1129 int r = 0;
1130 kvm_state->coalesced_mmio = 0;
1131 #ifdef KVM_CAP_COALESCED_MMIO
1132 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
1133 if (r > 0) {
1134 kvm_state->coalesced_mmio = r;
1135 return 0;
1137 #endif
1138 return r;
1141 #ifdef KVM_CAP_DEVICE_ASSIGNMENT
1142 int kvm_assign_pci_device(kvm_context_t kvm,
1143 struct kvm_assigned_pci_dev *assigned_dev)
1145 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_PCI_DEVICE, assigned_dev);
1148 static int kvm_old_assign_irq(kvm_context_t kvm,
1149 struct kvm_assigned_irq *assigned_irq)
1151 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_IRQ, assigned_irq);
1154 #ifdef KVM_CAP_ASSIGN_DEV_IRQ
1155 int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
1157 int ret;
1159 ret = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_ASSIGN_DEV_IRQ);
1160 if (ret > 0) {
1161 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_DEV_IRQ, assigned_irq);
1164 return kvm_old_assign_irq(kvm, assigned_irq);
1167 int kvm_deassign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
1169 return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_DEV_IRQ, assigned_irq);
1171 #else
1172 int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
1174 return kvm_old_assign_irq(kvm, assigned_irq);
1176 #endif
1177 #endif
1179 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT
1180 int kvm_deassign_pci_device(kvm_context_t kvm,
1181 struct kvm_assigned_pci_dev *assigned_dev)
1183 return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_PCI_DEVICE, assigned_dev);
1185 #endif
1187 int kvm_destroy_memory_region_works(kvm_context_t kvm)
1189 int ret = 0;
1191 #ifdef KVM_CAP_DESTROY_MEMORY_REGION_WORKS
1192 ret =
1193 kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION,
1194 KVM_CAP_DESTROY_MEMORY_REGION_WORKS);
1195 if (ret <= 0)
1196 ret = 0;
1197 #endif
1198 return ret;
1201 int kvm_reinject_control(kvm_context_t kvm, int pit_reinject)
1203 #ifdef KVM_CAP_REINJECT_CONTROL
1204 int r;
1205 struct kvm_reinject_control control;
1207 control.pit_reinject = pit_reinject;
1209 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_REINJECT_CONTROL);
1210 if (r > 0) {
1211 return kvm_vm_ioctl(kvm_state, KVM_REINJECT_CONTROL, &control);
1213 #endif
1214 return -ENOSYS;
1217 int kvm_has_gsi_routing(kvm_context_t kvm)
1219 int r = 0;
1221 #ifdef KVM_CAP_IRQ_ROUTING
1222 r = kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
1223 #endif
1224 return r;
1227 int kvm_get_gsi_count(kvm_context_t kvm)
1229 #ifdef KVM_CAP_IRQ_ROUTING
1230 return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
1231 #else
1232 return -EINVAL;
1233 #endif
1236 int kvm_clear_gsi_routes(kvm_context_t kvm)
1238 #ifdef KVM_CAP_IRQ_ROUTING
1239 kvm->irq_routes->nr = 0;
1240 return 0;
1241 #else
1242 return -EINVAL;
1243 #endif
1246 int kvm_add_routing_entry(kvm_context_t kvm,
1247 struct kvm_irq_routing_entry *entry)
1249 #ifdef KVM_CAP_IRQ_ROUTING
1250 struct kvm_irq_routing *z;
1251 struct kvm_irq_routing_entry *new;
1252 int n, size;
1254 if (kvm->irq_routes->nr == kvm->nr_allocated_irq_routes) {
1255 n = kvm->nr_allocated_irq_routes * 2;
1256 if (n < 64)
1257 n = 64;
1258 size = sizeof(struct kvm_irq_routing);
1259 size += n * sizeof(*new);
1260 z = realloc(kvm->irq_routes, size);
1261 if (!z)
1262 return -ENOMEM;
1263 kvm->nr_allocated_irq_routes = n;
1264 kvm->irq_routes = z;
1266 n = kvm->irq_routes->nr++;
1267 new = &kvm->irq_routes->entries[n];
1268 memset(new, 0, sizeof(*new));
1269 new->gsi = entry->gsi;
1270 new->type = entry->type;
1271 new->flags = entry->flags;
1272 new->u = entry->u;
1274 set_gsi(kvm, entry->gsi);
1276 return 0;
1277 #else
1278 return -ENOSYS;
1279 #endif
1282 int kvm_add_irq_route(kvm_context_t kvm, int gsi, int irqchip, int pin)
1284 #ifdef KVM_CAP_IRQ_ROUTING
1285 struct kvm_irq_routing_entry e;
1287 e.gsi = gsi;
1288 e.type = KVM_IRQ_ROUTING_IRQCHIP;
1289 e.flags = 0;
1290 e.u.irqchip.irqchip = irqchip;
1291 e.u.irqchip.pin = pin;
1292 return kvm_add_routing_entry(kvm, &e);
1293 #else
1294 return -ENOSYS;
1295 #endif
1298 int kvm_del_routing_entry(kvm_context_t kvm,
1299 struct kvm_irq_routing_entry *entry)
1301 #ifdef KVM_CAP_IRQ_ROUTING
1302 struct kvm_irq_routing_entry *e, *p;
1303 int i, gsi, found = 0;
1305 gsi = entry->gsi;
1307 for (i = 0; i < kvm->irq_routes->nr; ++i) {
1308 e = &kvm->irq_routes->entries[i];
1309 if (e->type == entry->type && e->gsi == gsi) {
1310 switch (e->type) {
1311 case KVM_IRQ_ROUTING_IRQCHIP:{
1312 if (e->u.irqchip.irqchip ==
1313 entry->u.irqchip.irqchip
1314 && e->u.irqchip.pin == entry->u.irqchip.pin) {
1315 p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
1316 *e = *p;
1317 found = 1;
1319 break;
1321 case KVM_IRQ_ROUTING_MSI:{
1322 if (e->u.msi.address_lo ==
1323 entry->u.msi.address_lo
1324 && e->u.msi.address_hi ==
1325 entry->u.msi.address_hi
1326 && e->u.msi.data == entry->u.msi.data) {
1327 p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
1328 *e = *p;
1329 found = 1;
1331 break;
1333 default:
1334 break;
1336 if (found) {
1337             /* If there are no other users of this GSI,
1338 * mark it available in the bitmap */
1339 for (i = 0; i < kvm->irq_routes->nr; i++) {
1340 e = &kvm->irq_routes->entries[i];
1341 if (e->gsi == gsi)
1342 break;
1344 if (i == kvm->irq_routes->nr)
1345 clear_gsi(kvm, gsi);
1347 return 0;
1351 return -ESRCH;
1352 #else
1353 return -ENOSYS;
1354 #endif
1357 int kvm_update_routing_entry(kvm_context_t kvm,
1358 struct kvm_irq_routing_entry *entry,
1359 struct kvm_irq_routing_entry *newentry)
1361 #ifdef KVM_CAP_IRQ_ROUTING
1362 struct kvm_irq_routing_entry *e;
1363 int i;
1365 if (entry->gsi != newentry->gsi || entry->type != newentry->type) {
1366 return -EINVAL;
1369 for (i = 0; i < kvm->irq_routes->nr; ++i) {
1370 e = &kvm->irq_routes->entries[i];
1371 if (e->type != entry->type || e->gsi != entry->gsi) {
1372 continue;
1374 switch (e->type) {
1375 case KVM_IRQ_ROUTING_IRQCHIP:
1376 if (e->u.irqchip.irqchip == entry->u.irqchip.irqchip &&
1377 e->u.irqchip.pin == entry->u.irqchip.pin) {
1378 memcpy(&e->u.irqchip, &newentry->u.irqchip,
1379 sizeof e->u.irqchip);
1380 return 0;
1382 break;
1383 case KVM_IRQ_ROUTING_MSI:
1384 if (e->u.msi.address_lo == entry->u.msi.address_lo &&
1385 e->u.msi.address_hi == entry->u.msi.address_hi &&
1386 e->u.msi.data == entry->u.msi.data) {
1387 memcpy(&e->u.msi, &newentry->u.msi, sizeof e->u.msi);
1388 return 0;
1390 break;
1391 default:
1392 break;
1395 return -ESRCH;
1396 #else
1397 return -ENOSYS;
1398 #endif
1401 int kvm_del_irq_route(kvm_context_t kvm, int gsi, int irqchip, int pin)
1403 #ifdef KVM_CAP_IRQ_ROUTING
1404 struct kvm_irq_routing_entry e;
1406 e.gsi = gsi;
1407 e.type = KVM_IRQ_ROUTING_IRQCHIP;
1408 e.flags = 0;
1409 e.u.irqchip.irqchip = irqchip;
1410 e.u.irqchip.pin = pin;
1411 return kvm_del_routing_entry(kvm, &e);
1412 #else
1413 return -ENOSYS;
1414 #endif
1417 int kvm_commit_irq_routes(kvm_context_t kvm)
1419 #ifdef KVM_CAP_IRQ_ROUTING
1420 kvm->irq_routes->flags = 0;
1421 return kvm_vm_ioctl(kvm_state, KVM_SET_GSI_ROUTING, kvm->irq_routes);
1422 #else
1423 return -ENOSYS;
1424 #endif
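/* Allocate the lowest unused GSI: invert each bitmap word and use ffs() to
 * find the first clear bit.  Over-allocated bits were marked used in
 * kvm_init(), so they are never handed out. */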
1427 int kvm_get_irq_route_gsi(kvm_context_t kvm)
1429 int i, bit;
1430 uint32_t *buf = kvm->used_gsi_bitmap;
1432 /* Return the lowest unused GSI in the bitmap */
1433 for (i = 0; i < kvm->max_gsi / 32; i++) {
1434 bit = ffs(~buf[i]);
1435 if (!bit)
1436 continue;
1438 return bit - 1 + i * 32;
1441 return -ENOSPC;
1444 #ifdef KVM_CAP_DEVICE_MSIX
1445 int kvm_assign_set_msix_nr(kvm_context_t kvm,
1446 struct kvm_assigned_msix_nr *msix_nr)
1448 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_NR, msix_nr);
1451 int kvm_assign_set_msix_entry(kvm_context_t kvm,
1452 struct kvm_assigned_msix_entry *entry)
1454 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_ENTRY, entry);
1456 #endif
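/* irqfd support: KVM_IRQFD binds an eventfd to a GSI so that a write to the
 * eventfd injects that interrupt directly in the kernel. */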
1458 #if defined(KVM_CAP_IRQFD) && defined(CONFIG_EVENTFD)
1460 #include <sys/eventfd.h>
1462 static int _kvm_irqfd(kvm_context_t kvm, int fd, int gsi, int flags)
1464 struct kvm_irqfd data = {
1465 .fd = fd,
1466 .gsi = gsi,
1467 .flags = flags,
1470 return kvm_vm_ioctl(kvm_state, KVM_IRQFD, &data);
1473 int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
1475 int r;
1476 int fd;
1478 if (!kvm_check_extension(kvm_state, KVM_CAP_IRQFD))
1479 return -ENOENT;
1481 fd = eventfd(0, 0);
1482 if (fd < 0)
1483 return -errno;
1485 r = _kvm_irqfd(kvm, fd, gsi, 0);
1486 if (r < 0) {
1487 close(fd);
1488 return -errno;
1491 return fd;
1494 #else /* KVM_CAP_IRQFD */
1496 int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
1498 return -ENOSYS;
1501 #endif /* KVM_CAP_IRQFD */
1502 static inline unsigned long kvm_get_thread_id(void)
1504 return syscall(SYS_gettid);
1507 static void qemu_cond_wait(pthread_cond_t *cond)
1509 CPUState *env = cpu_single_env;
1511 pthread_cond_wait(cond, &qemu_mutex);
1512 cpu_single_env = env;
1515 static void sig_ipi_handler(int n)
1519 static void hardware_memory_error(void)
1521 fprintf(stderr, "Hardware memory error!\n");
1522 exit(1);
1525 static void sigbus_reraise(void)
1527 sigset_t set;
1528 struct sigaction action;
1530 memset(&action, 0, sizeof(action));
1531 action.sa_handler = SIG_DFL;
1532 if (!sigaction(SIGBUS, &action, NULL)) {
1533 raise(SIGBUS);
1534 sigemptyset(&set);
1535 sigaddset(&set, SIGBUS);
1536 sigprocmask(SIG_UNBLOCK, &set, NULL);
1538 perror("Failed to re-raise SIGBUS!\n");
1539 abort();
1542 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
1543 void *ctx)
1545 #if defined(KVM_CAP_MCE) && defined(TARGET_I386)
1546 if (first_cpu->mcg_cap && siginfo->ssi_addr
1547 && siginfo->ssi_code == BUS_MCEERR_AO) {
1548 uint64_t status;
1549 unsigned long paddr;
1550 CPUState *cenv;
1552 /* Hope we are lucky for AO MCE */
1553 if (do_qemu_ram_addr_from_host((void *)(intptr_t)siginfo->ssi_addr,
1554 &paddr)) {
1555 fprintf(stderr, "Hardware memory error for memory used by "
1556 "QEMU itself instead of guest system!: %llx\n",
1557 (unsigned long long)siginfo->ssi_addr);
1558 return;
1560 status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1561 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1562 | 0xc0;
1563 kvm_inject_x86_mce(first_cpu, 9, status,
1564 MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
1565 (MCM_ADDR_PHYS << 6) | 0xc, 1);
1566 for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
1567 kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
1568 MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
1569 } else
1570 #endif
1572 if (siginfo->ssi_code == BUS_MCEERR_AO)
1573 return;
1574 else if (siginfo->ssi_code == BUS_MCEERR_AR)
1575 hardware_memory_error();
1576 else
1577 sigbus_reraise();
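/* on_vcpu() runs func(data) in the context of the given vcpu thread: call it
 * directly if we already are that thread, otherwise queue a work item, kick
 * the thread with SIG_IPI and wait for completion on qemu_work_cond. */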
1581 static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
1583 struct qemu_work_item wi;
1585 if (env == current_env) {
1586 func(data);
1587 return;
1590 wi.func = func;
1591 wi.data = data;
1592 if (!env->kvm_cpu_state.queued_work_first)
1593 env->kvm_cpu_state.queued_work_first = &wi;
1594 else
1595 env->kvm_cpu_state.queued_work_last->next = &wi;
1596 env->kvm_cpu_state.queued_work_last = &wi;
1597 wi.next = NULL;
1598 wi.done = false;
1600 pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
1601 while (!wi.done)
1602 qemu_cond_wait(&qemu_work_cond);
1605 void kvm_arch_get_registers(CPUState *env)
1607 kvm_arch_save_regs(env);
1608 kvm_arch_save_mpstate(env);
1609 #ifdef KVM_CAP_MP_STATE
1610 if (kvm_irqchip_in_kernel())
1611 env->halted = (env->mp_state == KVM_MP_STATE_HALTED);
1612 #endif
1615 static void do_kvm_cpu_synchronize_state(void *_env)
1617 CPUState *env = _env;
1618 if (!env->kvm_cpu_state.regs_modified) {
1619 kvm_arch_get_registers(env);
1620 env->kvm_cpu_state.regs_modified = 1;
1624 void kvm_cpu_synchronize_state(CPUState *env)
1626 if (!env->kvm_cpu_state.regs_modified)
1627 on_vcpu(env, do_kvm_cpu_synchronize_state, env);
1630 static void inject_interrupt(void *data)
1632 cpu_interrupt(current_env, (long) data);
1635 void kvm_inject_interrupt(CPUState *env, int mask)
1637 on_vcpu(env, inject_interrupt, (void *) (long) mask);
1640 void kvm_update_interrupt_request(CPUState *env)
1642 int signal = 0;
1644 if (env) {
1645 if (!current_env || !current_env->created)
1646 signal = 1;
1648 * Testing for created here is really redundant
1650 if (current_env && current_env->created &&
1651 env != current_env && !env->kvm_cpu_state.signalled)
1652 signal = 1;
1654 if (signal) {
1655 env->kvm_cpu_state.signalled = 1;
1656 if (env->kvm_cpu_state.thread)
1657 pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
1662 static void kvm_do_load_registers(void *_env)
1664 CPUState *env = _env;
1666 kvm_arch_load_regs(env);
1669 void kvm_load_registers(CPUState *env)
1671 if (kvm_enabled() && qemu_system_ready)
1672 on_vcpu(env, kvm_do_load_registers, env);
1675 static void kvm_do_save_registers(void *_env)
1677 CPUState *env = _env;
1679 kvm_arch_save_regs(env);
1682 void kvm_save_registers(CPUState *env)
1684 if (kvm_enabled())
1685 on_vcpu(env, kvm_do_save_registers, env);
1688 static void kvm_do_load_mpstate(void *_env)
1690 CPUState *env = _env;
1692 kvm_arch_load_mpstate(env);
1695 void kvm_load_mpstate(CPUState *env)
1697 if (kvm_enabled() && qemu_system_ready)
1698 on_vcpu(env, kvm_do_load_mpstate, env);
1701 static void kvm_do_save_mpstate(void *_env)
1703 CPUState *env = _env;
1705 kvm_arch_save_mpstate(env);
1708 void kvm_save_mpstate(CPUState *env)
1710 if (kvm_enabled())
1711 on_vcpu(env, kvm_do_save_mpstate, env);
1714 int kvm_cpu_exec(CPUState *env)
1716 int r;
1718 r = kvm_run(env->kvm_cpu_state.vcpu_ctx, env);
1719 if (r < 0) {
1720 printf("kvm_run returned %d\n", r);
1721 vm_stop(0);
1724 return 0;
1727 static int is_cpu_stopped(CPUState *env)
1729 return !vm_running || env->stopped;
1732 static void flush_queued_work(CPUState *env)
1734 struct qemu_work_item *wi;
1736 if (!env->kvm_cpu_state.queued_work_first)
1737 return;
1739 while ((wi = env->kvm_cpu_state.queued_work_first)) {
1740 env->kvm_cpu_state.queued_work_first = wi->next;
1741 wi->func(wi->data);
1742 wi->done = true;
1744 env->kvm_cpu_state.queued_work_last = NULL;
1745 pthread_cond_broadcast(&qemu_work_cond);
1748 static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
1750 #if defined(KVM_CAP_MCE) && defined(TARGET_I386)
1751 struct kvm_x86_mce mce = {
1752 .bank = 9,
1754 unsigned long paddr;
1755 int r;
1757 if (env->mcg_cap && siginfo->si_addr
1758 && (siginfo->si_code == BUS_MCEERR_AR
1759 || siginfo->si_code == BUS_MCEERR_AO)) {
1760 if (siginfo->si_code == BUS_MCEERR_AR) {
1761 /* Fake an Intel architectural Data Load SRAR UCR */
1762 mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1763 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1764 | MCI_STATUS_AR | 0x134;
1765 mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
1766 mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
1767 } else {
1768 /* Fake an Intel architectural Memory scrubbing UCR */
1769 mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1770 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1771 | 0xc0;
1772 mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
1773 mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
1775 if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) {
1776 fprintf(stderr, "Hardware memory error for memory used by "
1777 "QEMU itself instaed of guest system!\n");
1778 /* Hope we are lucky for AO MCE */
1779 if (siginfo->si_code == BUS_MCEERR_AO)
1780 return;
1781 else
1782 hardware_memory_error();
1784 mce.addr = paddr;
1785 r = kvm_set_mce(env->kvm_cpu_state.vcpu_ctx, &mce);
1786 if (r < 0) {
1787 fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
1788 abort();
1790 } else
1791 #endif
1793 if (siginfo->si_code == BUS_MCEERR_AO)
1794 return;
1795 else if (siginfo->si_code == BUS_MCEERR_AR)
1796 hardware_memory_error();
1797 else
1798 sigbus_reraise();
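/* Wait for work with qemu_mutex dropped: sigtimedwait() collects SIG_IPI and
 * SIGBUS for up to 'timeout' ms, then queued work items and stop requests are
 * processed with the mutex re-taken. */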
1802 static void kvm_main_loop_wait(CPUState *env, int timeout)
1804 struct timespec ts;
1805 int r, e;
1806 siginfo_t siginfo;
1807 sigset_t waitset;
1808 sigset_t chkset;
1810 ts.tv_sec = timeout / 1000;
1811 ts.tv_nsec = (timeout % 1000) * 1000000;
1812 sigemptyset(&waitset);
1813 sigaddset(&waitset, SIG_IPI);
1814 sigaddset(&waitset, SIGBUS);
1816 do {
1817 pthread_mutex_unlock(&qemu_mutex);
1819 r = sigtimedwait(&waitset, &siginfo, &ts);
1820 e = errno;
1822 pthread_mutex_lock(&qemu_mutex);
1824 if (r == -1 && !(e == EAGAIN || e == EINTR)) {
1825 printf("sigtimedwait: %s\n", strerror(e));
1826 exit(1);
1829 switch (r) {
1830 case SIGBUS:
1831 kvm_on_sigbus(env, &siginfo);
1832 break;
1833 default:
1834 break;
1837 r = sigpending(&chkset);
1838 if (r == -1) {
1839 printf("sigpending: %s\n", strerror(e));
1840 exit(1);
1842 } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
1844 cpu_single_env = env;
1845 flush_queued_work(env);
1847 if (env->stop) {
1848 env->stop = 0;
1849 env->stopped = 1;
1850 pthread_cond_signal(&qemu_pause_cond);
1853 env->kvm_cpu_state.signalled = 0;
1856 static int all_threads_paused(void)
1858 CPUState *penv = first_cpu;
1860 while (penv) {
1861 if (penv->stop)
1862 return 0;
1863 penv = (CPUState *) penv->next_cpu;
1866 return 1;
1869 static void pause_all_threads(void)
1871 CPUState *penv = first_cpu;
1873 while (penv) {
1874 if (penv != cpu_single_env) {
1875 penv->stop = 1;
1876 pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI);
1877 } else {
1878 penv->stop = 0;
1879 penv->stopped = 1;
1880 cpu_exit(penv);
1882 penv = (CPUState *) penv->next_cpu;
1885 while (!all_threads_paused())
1886 qemu_cond_wait(&qemu_pause_cond);
1889 static void resume_all_threads(void)
1891 CPUState *penv = first_cpu;
1893 assert(!cpu_single_env);
1895 while (penv) {
1896 penv->stop = 0;
1897 penv->stopped = 0;
1898 pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI);
1899 penv = (CPUState *) penv->next_cpu;
1903 static void kvm_vm_state_change_handler(void *context, int running, int reason)
1905 if (running)
1906 resume_all_threads();
1907 else
1908 pause_all_threads();
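/* The vcpu thread blocks all signals (see ap_main_loop()); KVM_SET_SIGNAL_MASK
 * then installs a mask without SIG_IPI/SIGBUS that is in effect only inside
 * KVM_RUN, so those two signals either interrupt the guest or stay pending
 * until sigtimedwait() in kvm_main_loop_wait() picks them up. */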
1911 static void setup_kernel_sigmask(CPUState *env)
1913 sigset_t set;
1915 sigemptyset(&set);
1916 sigaddset(&set, SIGUSR2);
1917 sigaddset(&set, SIGIO);
1918 sigaddset(&set, SIGALRM);
1919 sigprocmask(SIG_BLOCK, &set, NULL);
1921 sigprocmask(SIG_BLOCK, NULL, &set);
1922 sigdelset(&set, SIG_IPI);
1923 sigdelset(&set, SIGBUS);
1925 kvm_set_signal_mask(env->kvm_cpu_state.vcpu_ctx, &set);
1928 static void qemu_kvm_system_reset(void)
1930 CPUState *penv = first_cpu;
1932 pause_all_threads();
1934 qemu_system_reset();
1936 while (penv) {
1937 kvm_arch_cpu_reset(penv);
1938 penv = (CPUState *) penv->next_cpu;
1941 resume_all_threads();
1944 static void process_irqchip_events(CPUState *env)
1946 kvm_arch_process_irqchip_events(env);
1947 if (kvm_arch_has_work(env))
1948 env->halted = 0;
1951 static int kvm_main_loop_cpu(CPUState *env)
1953 setup_kernel_sigmask(env);
1955 pthread_mutex_lock(&qemu_mutex);
1957 kvm_arch_init_vcpu(env);
1958 #ifdef TARGET_I386
1959 kvm_tpr_vcpu_start(env);
1960 #endif
1962 cpu_single_env = env;
1963 kvm_arch_load_regs(env);
1965 while (1) {
1966 int run_cpu = !is_cpu_stopped(env);
1967 if (run_cpu && !kvm_irqchip_in_kernel()) {
1968 process_irqchip_events(env);
1969 run_cpu = !env->halted;
1971 if (run_cpu) {
1972 kvm_main_loop_wait(env, 0);
1973 kvm_cpu_exec(env);
1974 } else {
1975 kvm_main_loop_wait(env, 1000);
1978 pthread_mutex_unlock(&qemu_mutex);
1979 return 0;
1982 static void *ap_main_loop(void *_env)
1984 CPUState *env = _env;
1985 sigset_t signals;
1986 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1987 struct ioperm_data *data = NULL;
1988 #endif
1990 current_env = env;
1991 env->thread_id = kvm_get_thread_id();
1992 sigfillset(&signals);
1993 sigprocmask(SIG_BLOCK, &signals, NULL);
1994 env->kvm_cpu_state.vcpu_ctx = kvm_create_vcpu(env, env->cpu_index);
1996 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1997 /* do ioperm for io ports of assigned devices */
1998 QLIST_FOREACH(data, &ioperm_head, entries)
1999 on_vcpu(env, kvm_arch_do_ioperm, data);
2000 #endif
2002 /* signal VCPU creation */
2003 pthread_mutex_lock(&qemu_mutex);
2004 current_env->created = 1;
2005 pthread_cond_signal(&qemu_vcpu_cond);
2007 /* and wait for machine initialization */
2008 while (!qemu_system_ready)
2009 qemu_cond_wait(&qemu_system_cond);
2010 pthread_mutex_unlock(&qemu_mutex);
2012 kvm_main_loop_cpu(env);
2013 return NULL;
2016 void kvm_init_vcpu(CPUState *env)
2018 pthread_create(&env->kvm_cpu_state.thread, NULL, ap_main_loop, env);
2020 while (env->created == 0)
2021 qemu_cond_wait(&qemu_vcpu_cond);
2024 int kvm_vcpu_inited(CPUState *env)
2026 return env->created;
2029 #ifdef TARGET_I386
2030 void kvm_hpet_disable_kpit(void)
2032 struct kvm_pit_state2 ps2;
2034 kvm_get_pit2(kvm_context, &ps2);
2035 ps2.flags |= KVM_PIT_FLAGS_HPET_LEGACY;
2036 kvm_set_pit2(kvm_context, &ps2);
2039 void kvm_hpet_enable_kpit(void)
2041 struct kvm_pit_state2 ps2;
2043 kvm_get_pit2(kvm_context, &ps2);
2044 ps2.flags &= ~KVM_PIT_FLAGS_HPET_LEGACY;
2045 kvm_set_pit2(kvm_context, &ps2);
2047 #endif
2049 int kvm_init_ap(void)
2051 struct sigaction action;
2053 #ifdef TARGET_I386
2054 kvm_tpr_opt_setup();
2055 #endif
2056 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
2058 signal(SIG_IPI, sig_ipi_handler);
2060 memset(&action, 0, sizeof(action));
2061 action.sa_flags = SA_SIGINFO;
2062 action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
2063 sigaction(SIGBUS, &action, NULL);
2064 prctl(PR_MCE_KILL, 1, 1);
2065 return 0;
2068 void qemu_kvm_notify_work(void)
2070 uint64_t value = 1;
2071 char buffer[8];
2072 size_t offset = 0;
2074 if (io_thread_fd == -1)
2075 return;
2077 memcpy(buffer, &value, sizeof(value));
2079 while (offset < 8) {
2080 ssize_t len;
2082 len = write(io_thread_fd, buffer + offset, 8 - offset);
2083 if (len == -1 && errno == EINTR)
2084 continue;
2086         /* In case we have a pipe, there is no reason to insist on writing
2087 * 8 bytes
2089 if (len == -1 && errno == EAGAIN)
2090 break;
2092 if (len <= 0)
2093 break;
2095 offset += len;
2099 /* If we have signalfd, we mask out the signals we want to handle and then
2100 * use signalfd to listen for them. We rely on whatever the current signal
2101 * handler is to dispatch the signals when we receive them.
2104 static void sigfd_handler(void *opaque)
2106 int fd = (unsigned long) opaque;
2107 struct qemu_signalfd_siginfo info;
2108 struct sigaction action;
2109 ssize_t len;
2111 while (1) {
2112 do {
2113 len = read(fd, &info, sizeof(info));
2114 } while (len == -1 && errno == EINTR);
2116 if (len == -1 && errno == EAGAIN)
2117 break;
2119 if (len != sizeof(info)) {
2120 printf("read from sigfd returned %zd: %m\n", len);
2121 return;
2124 sigaction(info.ssi_signo, NULL, &action);
2125 if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction)
2126 action.sa_sigaction(info.ssi_signo,
2127 (siginfo_t *)&info, NULL);
2128 else if (action.sa_handler)
2129 action.sa_handler(info.ssi_signo);
2134 /* Used to break IO thread out of select */
2135 static void io_thread_wakeup(void *opaque)
2137 int fd = (unsigned long) opaque;
2138 char buffer[4096];
2140     /* Drain the pipe (or eventfd) */
2141 while (1) {
2142 ssize_t len;
2144 len = read(fd, buffer, sizeof(buffer));
2145 if (len == -1 && errno == EINTR)
2146 continue;
2148 if (len <= 0)
2149 break;
2153 int kvm_main_loop(void)
2155 int fds[2];
2156 sigset_t mask;
2157 int sigfd;
2159 io_thread = pthread_self();
2160 qemu_system_ready = 1;
2162 if (qemu_eventfd(fds) == -1) {
2163 fprintf(stderr, "failed to create eventfd\n");
2164 return -errno;
2167 fcntl(fds[0], F_SETFL, O_NONBLOCK);
2168 fcntl(fds[1], F_SETFL, O_NONBLOCK);
2170 qemu_set_fd_handler2(fds[0], NULL, io_thread_wakeup, NULL,
2171 (void *)(unsigned long) fds[0]);
2173 io_thread_fd = fds[1];
2175 sigemptyset(&mask);
2176 sigaddset(&mask, SIGIO);
2177 sigaddset(&mask, SIGALRM);
2178 sigaddset(&mask, SIGBUS);
2179 sigprocmask(SIG_BLOCK, &mask, NULL);
2181 sigfd = qemu_signalfd(&mask);
2182 if (sigfd == -1) {
2183 fprintf(stderr, "failed to create signalfd\n");
2184 return -errno;
2187 fcntl(sigfd, F_SETFL, O_NONBLOCK);
2189 qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
2190 (void *)(unsigned long) sigfd);
2192 pthread_cond_broadcast(&qemu_system_cond);
2194 io_thread_sigfd = sigfd;
2195 cpu_single_env = NULL;
2197 while (1) {
2198 main_loop_wait(1000);
2199 if (qemu_shutdown_requested()) {
2200 if (qemu_no_shutdown()) {
2201 vm_stop(0);
2202 } else
2203 break;
2204 } else if (qemu_powerdown_requested())
2205 qemu_irq_raise(qemu_system_powerdown);
2206 else if (qemu_reset_requested())
2207 qemu_kvm_system_reset();
2208 else if (kvm_debug_cpu_requested) {
2209 gdb_set_stop_cpu(kvm_debug_cpu_requested);
2210 vm_stop(EXCP_DEBUG);
2211 kvm_debug_cpu_requested = NULL;
2215 pause_all_threads();
2216 pthread_mutex_unlock(&qemu_mutex);
2218 return 0;
2221 #ifdef TARGET_I386
2222 static int destroy_region_works = 0;
2223 #endif
2226 #if !defined(TARGET_I386)
2227 int kvm_arch_init_irq_routing(void)
2229 return 0;
2231 #endif
2233 extern int no_hpet;
2235 static int kvm_create_context(void)
2237 int r;
2239 if (!kvm_irqchip) {
2240 kvm_disable_irqchip_creation(kvm_context);
2242 if (!kvm_pit) {
2243 kvm_disable_pit_creation(kvm_context);
2245 if (kvm_create(kvm_context, 0, NULL) < 0) {
2246 kvm_finalize(kvm_state);
2247 return -1;
2249 r = kvm_arch_qemu_create_context();
2250 if (r < 0)
2251 kvm_finalize(kvm_state);
2252 if (kvm_pit && !kvm_pit_reinject) {
2253 if (kvm_reinject_control(kvm_context, 0)) {
2254 fprintf(stderr, "failure to disable in-kernel PIT reinjection\n");
2255 return -1;
2258 #ifdef TARGET_I386
2259 destroy_region_works = kvm_destroy_memory_region_works(kvm_context);
2260 #endif
2262 r = kvm_arch_init_irq_routing();
2263 if (r < 0) {
2264 return r;
2267 kvm_init_ap();
2268 if (kvm_irqchip) {
2269 if (!qemu_kvm_has_gsi_routing()) {
2270 irq0override = 0;
2271 #ifdef TARGET_I386
2272 /* if kernel can't do irq routing, interrupt source
2273              * override 0->2 cannot be set up as required by the HPET,
2274              * so disable the HPET.
2276 no_hpet = 1;
2277 } else if (!qemu_kvm_has_pit_state2()) {
2278 no_hpet = 1;
2280 #else
2282 #endif
2285 return 0;
2288 #ifdef TARGET_I386
2289 static int must_use_aliases_source(target_phys_addr_t addr)
2291 if (destroy_region_works)
2292 return false;
2293 if (addr == 0xa0000 || addr == 0xa8000)
2294 return true;
2295 return false;
2298 static int must_use_aliases_target(target_phys_addr_t addr)
2300 if (destroy_region_works)
2301 return false;
2302 if (addr >= 0xe0000000 && addr < 0x100000000ull)
2303 return true;
2304 return false;
2307 static struct mapping {
2308 target_phys_addr_t phys;
2309 ram_addr_t ram;
2310 ram_addr_t len;
2311 } mappings[50];
2312 static int nr_mappings;
2314 static struct mapping *find_ram_mapping(ram_addr_t ram_addr)
2316 struct mapping *p;
2318 for (p = mappings; p < mappings + nr_mappings; ++p) {
2319 if (p->ram <= ram_addr && ram_addr < p->ram + p->len) {
2320 return p;
2323 return NULL;
2326 static struct mapping *find_mapping(target_phys_addr_t start_addr)
2328 struct mapping *p;
2330 for (p = mappings; p < mappings + nr_mappings; ++p) {
2331 if (p->phys <= start_addr && start_addr < p->phys + p->len) {
2332 return p;
2335 return NULL;
2338 static void drop_mapping(target_phys_addr_t start_addr)
2340 struct mapping *p = find_mapping(start_addr);
2342 if (p)
2343 *p = mappings[--nr_mappings];
2345 #endif
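/* kvm_set_phys_mem() translates qemu physical-memory registrations into KVM
 * memory slots: ROM/MMIO ranges drop any slot covering them, plain RAM is
 * registered with kvm_register_phys_mem(), and on i386 the mappings[] table
 * plus the alias helpers above work around kernels on which destroying a
 * memory region does not work (VGA windows at 0xa0000/0xa8000). */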
2347 void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
2348 ram_addr_t phys_offset)
2350 int r = 0;
2351 unsigned long area_flags;
2352 #ifdef TARGET_I386
2353 struct mapping *p;
2354 #endif
2356 if (start_addr + size > phys_ram_size) {
2357 phys_ram_size = start_addr + size;
2360 phys_offset &= ~IO_MEM_ROM;
2361 area_flags = phys_offset & ~TARGET_PAGE_MASK;
2363 if (area_flags != IO_MEM_RAM) {
2364 #ifdef TARGET_I386
2365 if (must_use_aliases_source(start_addr)) {
2366 kvm_destroy_memory_alias(kvm_context, start_addr);
2367 return;
2369 if (must_use_aliases_target(start_addr))
2370 return;
2371 #endif
2372 while (size > 0) {
2373 p = find_mapping(start_addr);
2374 if (p) {
2375 kvm_unregister_memory_area(kvm_context, p->phys, p->len);
2376 drop_mapping(p->phys);
2378 start_addr += TARGET_PAGE_SIZE;
2379 if (size > TARGET_PAGE_SIZE) {
2380 size -= TARGET_PAGE_SIZE;
2381 } else {
2382 size = 0;
2385 return;
2388 r = kvm_is_containing_region(kvm_context, start_addr, size);
2389 if (r)
2390 return;
2392 if (area_flags >= TLB_MMIO)
2393 return;
2395 #ifdef TARGET_I386
2396 if (must_use_aliases_source(start_addr)) {
2397 p = find_ram_mapping(phys_offset);
2398 if (p) {
2399 kvm_create_memory_alias(kvm_context, start_addr, size,
2400 p->phys + (phys_offset - p->ram));
2402 return;
2404 #endif
2406 r = kvm_register_phys_mem(kvm_context, start_addr,
2407 qemu_get_ram_ptr(phys_offset), size, 0);
2408 if (r < 0) {
2409 printf("kvm_cpu_register_physical_memory: failed\n");
2410 exit(1);
2412 #ifdef TARGET_I386
2413 drop_mapping(start_addr);
2414 p = &mappings[nr_mappings++];
2415 p->phys = start_addr;
2416 p->ram = phys_offset;
2417 p->len = size;
2418 #endif
2420 return;
2423 int kvm_setup_guest_memory(void *area, unsigned long size)
2425 int ret = 0;
2427 #ifdef MADV_DONTFORK
2428 if (kvm_enabled() && !kvm_has_sync_mmu())
2429 ret = madvise(area, size, MADV_DONTFORK);
2430 #endif
2432 if (ret)
2433 perror("madvise");
2435 return ret;
2438 #ifdef KVM_CAP_SET_GUEST_DEBUG
2440 struct kvm_set_guest_debug_data {
2441 struct kvm_guest_debug dbg;
2442 int err;
2445 static void kvm_invoke_set_guest_debug(void *data)
2447 struct kvm_set_guest_debug_data *dbg_data = data;
2449 if (cpu_single_env->kvm_cpu_state.regs_modified) {
2450 kvm_arch_put_registers(cpu_single_env);
2451 cpu_single_env->kvm_cpu_state.regs_modified = 0;
2453 dbg_data->err =
2454 kvm_set_guest_debug(cpu_single_env->kvm_cpu_state.vcpu_ctx,
2455 &dbg_data->dbg);
2458 int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
2460 struct kvm_set_guest_debug_data data;
2462 data.dbg.control = 0;
2463 if (env->singlestep_enabled)
2464 data.dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
2466 kvm_arch_update_guest_debug(env, &data.dbg);
2467 data.dbg.control |= reinject_trap;
2469 on_vcpu(env, kvm_invoke_set_guest_debug, &data);
2470 return data.err;
2473 #endif
2476 * dirty pages logging
2478 /* FIXME: use unsigned long pointer instead of unsigned char */
2479 unsigned char *kvm_dirty_bitmap = NULL;
2480 int kvm_physical_memory_set_dirty_tracking(int enable)
2482 int r = 0;
2484 if (!kvm_enabled())
2485 return 0;
2487 if (enable) {
2488 if (!kvm_dirty_bitmap) {
2489 unsigned bitmap_size = BITMAP_SIZE(phys_ram_size);
2490 kvm_dirty_bitmap = qemu_malloc(bitmap_size);
2491 r = kvm_dirty_pages_log_enable_all(kvm_context);
2493 } else {
2494 if (kvm_dirty_bitmap) {
2495 r = kvm_dirty_pages_log_reset(kvm_context);
2496 qemu_free(kvm_dirty_bitmap);
2497 kvm_dirty_bitmap = NULL;
2500 return r;
2503 /* get kvm's dirty pages bitmap and update qemu's */
2504 static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
2505 unsigned char *bitmap,
2506 unsigned long offset,
2507 unsigned long mem_size)
2509 unsigned int i, j, n = 0;
2510 unsigned char c;
2511 unsigned long page_number, addr, addr1;
2512 ram_addr_t ram_addr;
2513 unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + 7) / 8;
2516 * bitmap-traveling is faster than memory-traveling (for addr...)
2517 * especially when most of the memory is not dirty.
2519 for (i = 0; i < len; i++) {
2520 c = bitmap[i];
2521 while (c > 0) {
2522 j = ffsl(c) - 1;
2523 c &= ~(1u << j);
2524 page_number = i * 8 + j;
2525 addr1 = page_number * TARGET_PAGE_SIZE;
2526 addr = offset + addr1;
2527 ram_addr = cpu_get_physical_page_desc(addr);
2528 cpu_physical_memory_set_dirty(ram_addr);
2529 n++;
2532 return 0;
2535 static int kvm_get_dirty_bitmap_cb(unsigned long start, unsigned long len,
2536 void *bitmap, void *opaque)
2538 return kvm_get_dirty_pages_log_range(start, bitmap, start, len);
2542 * get kvm's dirty pages bitmap and update qemu's
2543 * we only care about physical ram, which resides in slots 0 and 3
2545 int kvm_update_dirty_pages_log(void)
2547 int r = 0;
2550 r = kvm_get_dirty_pages_range(kvm_context, 0, -1UL, NULL,
2551 kvm_get_dirty_bitmap_cb);
2552 return r;
2555 void kvm_qemu_log_memory(target_phys_addr_t start, target_phys_addr_t size,
2556 int log)
2558 if (log)
2559 kvm_dirty_pages_log_enable_slot(kvm_context, start, size);
2560 else {
2561 #ifdef TARGET_I386
2562 if (must_use_aliases_target(start))
2563 return;
2564 #endif
2565 kvm_dirty_pages_log_disable_slot(kvm_context, start, size);
2569 #ifdef KVM_CAP_IRQCHIP
2571 int kvm_set_irq(int irq, int level, int *status)
2573 return kvm_set_irq_level(kvm_context, irq, level, status);
2576 #endif
2578 int qemu_kvm_get_dirty_pages(unsigned long phys_addr, void *buf)
2580 return kvm_get_dirty_pages(kvm_context, phys_addr, buf);
2583 void kvm_mutex_unlock(void)
2585 assert(!cpu_single_env);
2586 pthread_mutex_unlock(&qemu_mutex);
2589 void kvm_mutex_lock(void)
2591 pthread_mutex_lock(&qemu_mutex);
2592 cpu_single_env = NULL;
2595 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2596 void kvm_add_ioperm_data(struct ioperm_data *data)
2598 QLIST_INSERT_HEAD(&ioperm_head, data, entries);
2601 void kvm_remove_ioperm_data(unsigned long start_port, unsigned long num)
2603 struct ioperm_data *data;
2605 data = QLIST_FIRST(&ioperm_head);
2606 while (data) {
2607 struct ioperm_data *next = QLIST_NEXT(data, entries);
2609 if (data->start_port == start_port && data->num == num) {
2610 QLIST_REMOVE(data, entries);
2611 qemu_free(data);
2614 data = next;
2618 void kvm_ioperm(CPUState *env, void *data)
2620 if (kvm_enabled() && qemu_system_ready)
2621 on_vcpu(env, kvm_arch_do_ioperm, data);
2624 #endif
2626 int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
2627 target_phys_addr_t end_addr)
2629 #ifndef TARGET_IA64
2631 #ifdef TARGET_I386
2632 if (must_use_aliases_source(start_addr))
2633 return 0;
2634 #endif
2636 kvm_get_dirty_pages_range(kvm_context, start_addr,
2637 end_addr - start_addr, NULL,
2638 kvm_get_dirty_bitmap_cb);
2639 #endif
2640 return 0;
2643 int kvm_log_start(target_phys_addr_t phys_addr, target_phys_addr_t len)
2645 #ifdef TARGET_I386
2646 if (must_use_aliases_source(phys_addr))
2647 return 0;
2648 #endif
2650 #ifndef TARGET_IA64
2651 kvm_qemu_log_memory(phys_addr, len, 1);
2652 #endif
2653 return 0;
2656 int kvm_log_stop(target_phys_addr_t phys_addr, target_phys_addr_t len)
2658 #ifdef TARGET_I386
2659 if (must_use_aliases_source(phys_addr))
2660 return 0;
2661 #endif
2663 #ifndef TARGET_IA64
2664 kvm_qemu_log_memory(phys_addr, len, 0);
2665 #endif
2666 return 0;
2669 int kvm_set_boot_cpu_id(uint32_t id)
2671 return kvm_set_boot_vcpu_id(kvm_context, id);
2674 #ifdef TARGET_I386
2675 #ifdef KVM_CAP_MCE
2676 struct kvm_x86_mce_data {
2677 CPUState *env;
2678 struct kvm_x86_mce *mce;
2679 int abort_on_error;
2682 static void kvm_do_inject_x86_mce(void *_data)
2684 struct kvm_x86_mce_data *data = _data;
2685 int r;
2687 r = kvm_set_mce(data->env->kvm_cpu_state.vcpu_ctx, data->mce);
2688 if (r < 0) {
2689 perror("kvm_set_mce FAILED");
2690 if (data->abort_on_error)
2691 abort();
2694 #endif
2696 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
2697 uint64_t mcg_status, uint64_t addr, uint64_t misc,
2698 int abort_on_error)
2700 #ifdef KVM_CAP_MCE
2701 struct kvm_x86_mce mce = {
2702 .bank = bank,
2703 .status = status,
2704 .mcg_status = mcg_status,
2705 .addr = addr,
2706 .misc = misc,
2708 struct kvm_x86_mce_data data = {
2709 .env = cenv,
2710 .mce = &mce,
2711 .abort_on_error = abort_on_error,
2714 if (!cenv->mcg_cap) {
2715 fprintf(stderr, "MCE support is not enabled!\n");
2716 return;
2718 on_vcpu(cenv, kvm_do_inject_x86_mce, &data);
2719 #else
2720 if (abort_on_error)
2721 abort();
2722 #endif
2724 #endif