1 /*
2 * qemu/kvm integration
4 * Copyright (C) 2006-2008 Qumranet Technologies
6 * Licensed under the terms of the GNU GPL version 2 or higher.
7 */
8 #include "config.h"
9 #include "config-host.h"
11 #include <assert.h>
12 #include <string.h>
13 #include "hw/hw.h"
14 #include "sysemu.h"
15 #include "qemu-common.h"
16 #include "console.h"
17 #include "block.h"
18 #include "compatfd.h"
19 #include "gdbstub.h"
20 #include "monitor.h"
22 #include "qemu-kvm.h"
23 #include "libkvm.h"
25 #include <pthread.h>
26 #include <sys/utsname.h>
27 #include <sys/syscall.h>
28 #include <sys/mman.h>
29 #include <sys/ioctl.h>
30 #include "compatfd.h"
31 #include <sys/prctl.h>
33 #define false 0
34 #define true 1
36 #ifndef PR_MCE_KILL
37 #define PR_MCE_KILL 33
38 #endif
40 #ifndef BUS_MCEERR_AR
41 #define BUS_MCEERR_AR 4
42 #endif
43 #ifndef BUS_MCEERR_AO
44 #define BUS_MCEERR_AO 5
45 #endif
47 #define EXPECTED_KVM_API_VERSION 12
49 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
50 #error libkvm: userspace and kernel version mismatch
51 #endif
53 int kvm_allowed = 1;
54 int kvm_irqchip = 1;
55 int kvm_pit = 1;
56 int kvm_pit_reinject = 1;
57 int kvm_nested = 0;
60 KVMState *kvm_state;
61 kvm_context_t kvm_context;
63 pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
64 pthread_cond_t qemu_vcpu_cond = PTHREAD_COND_INITIALIZER;
65 pthread_cond_t qemu_system_cond = PTHREAD_COND_INITIALIZER;
66 pthread_cond_t qemu_pause_cond = PTHREAD_COND_INITIALIZER;
67 pthread_cond_t qemu_work_cond = PTHREAD_COND_INITIALIZER;
68 __thread CPUState *current_env;
70 static int qemu_system_ready;
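/*
 * SIG_IPI is the signal used to kick a vcpu thread out of KVM_RUN (and out
 * of sigtimedwait() in kvm_main_loop_wait() below); the offset into the
 * real-time range is presumably chosen to avoid clashing with other users
 * of the low RT signals.
 */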
72 #define SIG_IPI (SIGRTMIN+4)
74 pthread_t io_thread;
75 static int io_thread_fd = -1;
76 static int io_thread_sigfd = -1;
78 static CPUState *kvm_debug_cpu_requested;
80 static uint64_t phys_ram_size;
82 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
83 /* The list of ioperm_data */
84 static QLIST_HEAD(, ioperm_data) ioperm_head;
85 #endif
87 //#define DEBUG_MEMREG
88 #ifdef DEBUG_MEMREG
89 #define DPRINTF(fmt, args...) \
90 do { fprintf(stderr, "%s:%d " fmt , __func__, __LINE__, ##args); } while (0)
91 #else
92 #define DPRINTF(fmt, args...) do {} while (0)
93 #endif
95 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
97 int kvm_abi = EXPECTED_KVM_API_VERSION;
98 int kvm_page_size;
100 #ifdef KVM_CAP_SET_GUEST_DEBUG
101 static int kvm_debug(CPUState *env,
102 struct kvm_debug_exit_arch *arch_info)
104 int handle = kvm_arch_debug(arch_info);
106 if (handle) {
107 kvm_debug_cpu_requested = env;
108 env->stopped = 1;
110 return handle;
112 #endif
114 static int handle_unhandled(uint64_t reason)
116 fprintf(stderr, "kvm: unhandled exit %" PRIx64 "\n", reason);
117 return -EINVAL;
121 static inline void set_gsi(kvm_context_t kvm, unsigned int gsi)
123 uint32_t *bitmap = kvm->used_gsi_bitmap;
125 if (gsi < kvm->max_gsi)
126 bitmap[gsi / 32] |= 1U << (gsi % 32);
127 else
128 DPRINTF("Invalid GSI %u\n", gsi);
131 static inline void clear_gsi(kvm_context_t kvm, unsigned int gsi)
133 uint32_t *bitmap = kvm->used_gsi_bitmap;
135 if (gsi < kvm->max_gsi)
136 bitmap[gsi / 32] &= ~(1U << (gsi % 32));
137 else
138 DPRINTF("Invalid GSI %u\n", gsi);
141 struct slot_info {
142 unsigned long phys_addr;
143 unsigned long len;
144 unsigned long userspace_addr;
145 unsigned flags;
146 int logging_count;
149 struct slot_info slots[KVM_MAX_NUM_MEM_REGIONS];
151 static void init_slots(void)
153 int i;
155 for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i)
156 slots[i].len = 0;
159 static int get_free_slot(kvm_context_t kvm)
161 int i;
162 int tss_ext;
164 #if defined(KVM_CAP_SET_TSS_ADDR) && !defined(__s390__)
165 tss_ext = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_SET_TSS_ADDR);
166 #else
167 tss_ext = 0;
168 #endif
170 /*
171  * on older kernels where the set tss ioctl is not supported we must save
172  * slot 0 to hold the extended memory, as the vmx will use the last 3
173  * pages of this slot.
174  */
175 if (tss_ext > 0)
176 i = 0;
177 else
178 i = 1;
180 for (; i < KVM_MAX_NUM_MEM_REGIONS; ++i)
181 if (!slots[i].len)
182 return i;
183 return -1;
186 static void register_slot(int slot, unsigned long phys_addr,
187 unsigned long len, unsigned long userspace_addr,
188 unsigned flags)
190 slots[slot].phys_addr = phys_addr;
191 slots[slot].len = len;
192 slots[slot].userspace_addr = userspace_addr;
193 slots[slot].flags = flags;
196 static void free_slot(int slot)
198 slots[slot].len = 0;
199 slots[slot].logging_count = 0;
202 static int get_slot(unsigned long phys_addr)
204 int i;
206 for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) {
207 if (slots[i].len && slots[i].phys_addr <= phys_addr &&
208 (slots[i].phys_addr + slots[i].len - 1) >= phys_addr)
209 return i;
211 return -1;
214 /* Returns -1 if this slot is not totally contained in any other,
215  * and the number of the slot otherwise */
216 static int get_container_slot(uint64_t phys_addr, unsigned long size)
218 int i;
220 for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i)
221 if (slots[i].len && slots[i].phys_addr <= phys_addr &&
222 (slots[i].phys_addr + slots[i].len) >= phys_addr + size)
223 return i;
224 return -1;
227 int kvm_is_containing_region(kvm_context_t kvm, unsigned long phys_addr,
228 unsigned long size)
230 int slot = get_container_slot(phys_addr, size);
231 if (slot == -1)
232 return 0;
233 return 1;
236 /*
237  * dirty pages logging control
238  */
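/*
 * Helper for the slot-level enable/disable entry points below: replaces the
 * bits of the slot's flags selected by @mask with @flags and, when anything
 * actually changed, re-issues KVM_SET_USER_MEMORY_REGION so the kernel sees
 * the new KVM_MEM_LOG_DIRTY_PAGES setting.
 */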
239 static int kvm_dirty_pages_log_change(kvm_context_t kvm,
240 unsigned long phys_addr, unsigned flags,
241 unsigned mask)
243 int r = -1;
244 int slot = get_slot(phys_addr);
246 if (slot == -1) {
247 fprintf(stderr, "BUG: %s: invalid parameters\n", __FUNCTION__);
248 return 1;
251 flags = (slots[slot].flags & ~mask) | flags;
252 if (flags == slots[slot].flags)
253 return 0;
254 slots[slot].flags = flags;
257 struct kvm_userspace_memory_region mem = {
258 .slot = slot,
259 .memory_size = slots[slot].len,
260 .guest_phys_addr = slots[slot].phys_addr,
261 .userspace_addr = slots[slot].userspace_addr,
262 .flags = slots[slot].flags,
266 DPRINTF("slot %d start %llx len %llx flags %x\n",
267 mem.slot, mem.guest_phys_addr, mem.memory_size, mem.flags);
268 r = kvm_vm_ioctl(kvm_state, KVM_SET_USER_MEMORY_REGION, &mem);
269 if (r < 0)
270 fprintf(stderr, "%s: %m\n", __FUNCTION__);
272 return r;
275 static int kvm_dirty_pages_log_change_all(kvm_context_t kvm,
276 int (*change)(kvm_context_t kvm,
277 uint64_t start,
278 uint64_t len))
280 int i, r;
282 for (i = r = 0; i < KVM_MAX_NUM_MEM_REGIONS && r == 0; i++) {
283 if (slots[i].len)
284 r = change(kvm, slots[i].phys_addr, slots[i].len);
286 return r;
289 int kvm_dirty_pages_log_enable_slot(kvm_context_t kvm, uint64_t phys_addr,
290 uint64_t len)
292 int slot = get_slot(phys_addr);
294 DPRINTF("start %" PRIx64 " len %" PRIx64 "\n", phys_addr, len);
295 if (slot == -1) {
296 fprintf(stderr, "BUG: %s: invalid parameters\n", __func__);
297 return -EINVAL;
300 if (slots[slot].logging_count++)
301 return 0;
303 return kvm_dirty_pages_log_change(kvm, slots[slot].phys_addr,
304 KVM_MEM_LOG_DIRTY_PAGES,
305 KVM_MEM_LOG_DIRTY_PAGES);
308 int kvm_dirty_pages_log_disable_slot(kvm_context_t kvm, uint64_t phys_addr,
309 uint64_t len)
311 int slot = get_slot(phys_addr);
313 if (slot == -1) {
314 fprintf(stderr, "BUG: %s: invalid parameters\n", __func__);
315 return -EINVAL;
318 if (--slots[slot].logging_count)
319 return 0;
321 return kvm_dirty_pages_log_change(kvm, slots[slot].phys_addr, 0,
322 KVM_MEM_LOG_DIRTY_PAGES);
325 /*
326  * Enable dirty page logging for all memory regions
327  */
328 int kvm_dirty_pages_log_enable_all(kvm_context_t kvm)
330 if (kvm->dirty_pages_log_all)
331 return 0;
332 kvm->dirty_pages_log_all = 1;
333 return kvm_dirty_pages_log_change_all(kvm, kvm_dirty_pages_log_enable_slot);
336 /*
337  * Enable dirty page logging only for memory regions that were created with
338  * dirty logging enabled (disable for all other memory regions).
339  */
340 int kvm_dirty_pages_log_reset(kvm_context_t kvm)
342 if (!kvm->dirty_pages_log_all)
343 return 0;
344 kvm->dirty_pages_log_all = 0;
345 return kvm_dirty_pages_log_change_all(kvm,
346 kvm_dirty_pages_log_disable_slot);
350 static int kvm_create_context(void);
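/*
 * One-time setup: open /dev/kvm, check that the kernel speaks the API
 * version this file was built against, allocate the global kvm_state and
 * the GSI allocation bitmap, and finally (with qemu_mutex held) create the
 * VM itself via kvm_create_context().
 */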
352 int kvm_init(int smp_cpus)
354 int fd;
355 int r, gsi_count;
358 fd = open("/dev/kvm", O_RDWR);
359 if (fd == -1) {
360 perror("open /dev/kvm");
361 return -1;
363 r = ioctl(fd, KVM_GET_API_VERSION, 0);
364 if (r == -1) {
365 fprintf(stderr,
366 "kvm kernel version too old: "
367 "KVM_GET_API_VERSION ioctl not supported\n");
368 goto out_close;
370 if (r < EXPECTED_KVM_API_VERSION) {
371 fprintf(stderr, "kvm kernel version too old: "
372 "We expect API version %d or newer, but got "
373 "version %d\n", EXPECTED_KVM_API_VERSION, r);
374 goto out_close;
376 if (r > EXPECTED_KVM_API_VERSION) {
377 fprintf(stderr, "kvm userspace version too old\n");
378 goto out_close;
380 kvm_abi = r;
381 kvm_page_size = getpagesize();
382 kvm_state = qemu_mallocz(sizeof(*kvm_state));
383 kvm_context = &kvm_state->kvm_context;
385 kvm_state->fd = fd;
386 kvm_state->vmfd = -1;
387 kvm_context->opaque = cpu_single_env;
388 kvm_context->dirty_pages_log_all = 0;
389 kvm_context->no_irqchip_creation = 0;
390 kvm_context->no_pit_creation = 0;
392 #ifdef KVM_CAP_SET_GUEST_DEBUG
393 QTAILQ_INIT(&kvm_state->kvm_sw_breakpoints);
394 #endif
396 gsi_count = kvm_get_gsi_count(kvm_context);
397 if (gsi_count > 0) {
398 int gsi_bits, i;
400 /* Round up so we can search ints using ffs */
401 gsi_bits = ALIGN(gsi_count, 32);
402 kvm_context->used_gsi_bitmap = qemu_mallocz(gsi_bits / 8);
403 kvm_context->max_gsi = gsi_bits;
405 /* Mark any over-allocated bits as already in use */
406 for (i = gsi_count; i < gsi_bits; i++)
407 set_gsi(kvm_context, i);
410 pthread_mutex_lock(&qemu_mutex);
411 return kvm_create_context();
413 out_close:
414 close(fd);
415 return -1;
418 static void kvm_finalize(KVMState *s)
420 /* FIXME
421 if (kvm->vcpu_fd[0] != -1)
422 close(kvm->vcpu_fd[0]);
423 if (kvm->vm_fd != -1)
424 close(kvm->vm_fd);
425 */
426 close(s->fd);
427 free(s);
430 void kvm_disable_irqchip_creation(kvm_context_t kvm)
432 kvm->no_irqchip_creation = 1;
435 void kvm_disable_pit_creation(kvm_context_t kvm)
437 kvm->no_pit_creation = 1;
440 static void kvm_create_vcpu(CPUState *env, int id)
442 long mmap_size;
443 int r;
445 r = kvm_vm_ioctl(kvm_state, KVM_CREATE_VCPU, id);
446 if (r < 0) {
447 fprintf(stderr, "kvm_create_vcpu: %m\n");
448 return;
451 env->kvm_fd = r;
452 env->kvm_state = kvm_state;
454 mmap_size = kvm_ioctl(kvm_state, KVM_GET_VCPU_MMAP_SIZE, 0);
455 if (mmap_size < 0) {
456 fprintf(stderr, "get vcpu mmap size: %m\n");
457 goto err_fd;
459 env->kvm_run =
460 mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, env->kvm_fd,
461      0);
462 if (env->kvm_run == MAP_FAILED) {
463 fprintf(stderr, "mmap vcpu area: %m\n");
464 goto err_fd;
467 return;
468 err_fd:
469 close(env->kvm_fd);
472 static int kvm_set_boot_vcpu_id(kvm_context_t kvm, uint32_t id)
474 #ifdef KVM_CAP_SET_BOOT_CPU_ID
475 int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_SET_BOOT_CPU_ID);
476 if (r > 0)
477 return kvm_vm_ioctl(kvm_state, KVM_SET_BOOT_CPU_ID, id);
478 return -ENOSYS;
479 #else
480 return -ENOSYS;
481 #endif
484 int kvm_create_vm(kvm_context_t kvm)
486 int fd;
487 #ifdef KVM_CAP_IRQ_ROUTING
488 kvm->irq_routes = qemu_mallocz(sizeof(*kvm->irq_routes));
489 kvm->nr_allocated_irq_routes = 0;
490 #endif
492 fd = kvm_ioctl(kvm_state, KVM_CREATE_VM, 0);
493 if (fd < 0) {
494 fprintf(stderr, "kvm_create_vm: %m\n");
495 return -1;
497 kvm_state->vmfd = fd;
498 return 0;
501 static int kvm_create_default_phys_mem(kvm_context_t kvm,
502 unsigned long phys_mem_bytes,
503 void **vm_mem)
505 #ifdef KVM_CAP_USER_MEMORY
506 int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY);
507 if (r > 0)
508 return 0;
509 fprintf(stderr,
510 "Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported\n");
511 #else
512 #error Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported
513 #endif
514 return -1;
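/*
 * Probe for an in-kernel irqchip: when KVM_CAP_IRQCHIP is available and
 * creation was not disabled on the command line, issue KVM_CREATE_IRQCHIP
 * and upgrade the injection ioctl to KVM_IRQ_LINE_STATUS if the kernel can
 * report the injection status back to us.
 */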
517 void kvm_create_irqchip(kvm_context_t kvm)
519 int r;
521 kvm->irqchip_in_kernel = 0;
522 #ifdef KVM_CAP_IRQCHIP
523 if (!kvm->no_irqchip_creation) {
524 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP);
525 if (r > 0) { /* kernel irqchip supported */
526 r = kvm_vm_ioctl(kvm_state, KVM_CREATE_IRQCHIP);
527 if (r >= 0) {
528 kvm->irqchip_inject_ioctl = KVM_IRQ_LINE;
529 #if defined(KVM_CAP_IRQ_INJECT_STATUS) && defined(KVM_IRQ_LINE_STATUS)
530 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION,
531 KVM_CAP_IRQ_INJECT_STATUS);
532 if (r > 0)
533 kvm->irqchip_inject_ioctl = KVM_IRQ_LINE_STATUS;
534 #endif
535 kvm->irqchip_in_kernel = 1;
536 } else
537 fprintf(stderr, "Create kernel PIC irqchip failed\n");
540 #endif
541 kvm_state->irqchip_in_kernel = kvm->irqchip_in_kernel;
544 int kvm_create(kvm_context_t kvm, unsigned long phys_mem_bytes, void **vm_mem)
546 int r;
548 r = kvm_create_vm(kvm);
549 if (r < 0)
550 return r;
551 r = kvm_arch_create(kvm, phys_mem_bytes, vm_mem);
552 if (r < 0)
553 return r;
554 init_slots();
555 r = kvm_create_default_phys_mem(kvm, phys_mem_bytes, vm_mem);
556 if (r < 0)
557 return r;
558 kvm_create_irqchip(kvm);
560 return 0;
564 int kvm_register_phys_mem(kvm_context_t kvm,
565 unsigned long phys_start, void *userspace_addr,
566 unsigned long len, int log)
569 struct kvm_userspace_memory_region memory = {
570 .memory_size = len,
571 .guest_phys_addr = phys_start,
572 .userspace_addr = (unsigned long) (uintptr_t) userspace_addr,
573 .flags = log ? KVM_MEM_LOG_DIRTY_PAGES : 0,
575 int r;
577 memory.slot = get_free_slot(kvm);
578 DPRINTF
579 ("memory: gpa: %llx, size: %llx, uaddr: %llx, slot: %x, flags: %x\n",
580 memory.guest_phys_addr, memory.memory_size, memory.userspace_addr,
581 memory.slot, memory.flags);
582 r = kvm_vm_ioctl(kvm_state, KVM_SET_USER_MEMORY_REGION, &memory);
583 if (r < 0) {
584 fprintf(stderr, "create_userspace_phys_mem: %s\n", strerror(-r));
585 return -1;
587 register_slot(memory.slot, memory.guest_phys_addr, memory.memory_size,
588 memory.userspace_addr, memory.flags);
589 return 0;
593 /* destroy/free a whole slot.
594  * phys_start, len and slot are the params passed to kvm_create_phys_mem()
595  */
596 void kvm_destroy_phys_mem(kvm_context_t kvm, unsigned long phys_start,
597 unsigned long len)
599 int slot;
600 int r;
601 struct kvm_userspace_memory_region memory = {
602 .memory_size = 0,
603 .guest_phys_addr = phys_start,
604 .userspace_addr = 0,
605 .flags = 0,
608 slot = get_slot(phys_start);
610 if ((slot >= KVM_MAX_NUM_MEM_REGIONS) || (slot == -1)) {
611 fprintf(stderr, "BUG: %s: invalid parameters (slot=%d)\n", __FUNCTION__,
612 slot);
613 return;
615 if (phys_start != slots[slot].phys_addr) {
616 fprintf(stderr,
617 "WARNING: %s: phys_start is 0x%lx expecting 0x%lx\n",
618 __FUNCTION__, phys_start, slots[slot].phys_addr);
619 phys_start = slots[slot].phys_addr;
622 memory.slot = slot;
623 DPRINTF("slot %d start %llx len %llx flags %x\n",
624 memory.slot, memory.guest_phys_addr, memory.memory_size,
625 memory.flags);
626 r = kvm_vm_ioctl(kvm_state, KVM_SET_USER_MEMORY_REGION, &memory);
627 if (r < 0) {
628 fprintf(stderr, "destroy_userspace_phys_mem: %s\n", strerror(-r));
629 return;
632 free_slot(memory.slot);
635 void kvm_unregister_memory_area(kvm_context_t kvm, uint64_t phys_addr,
636 unsigned long size)
639 int slot = get_container_slot(phys_addr, size);
641 if (slot != -1) {
642 DPRINTF("Unregistering memory region %" PRIx64 " (%lx)\n", phys_addr, size);
643 kvm_destroy_phys_mem(kvm, phys_addr, size);
644 return;
648 static int kvm_get_map(kvm_context_t kvm, int ioctl_num, int slot, void *buf)
650 int r;
651 struct kvm_dirty_log log = {
652 .slot = slot,
655 log.dirty_bitmap = buf;
657 r = kvm_vm_ioctl(kvm_state, ioctl_num, &log);
658 if (r < 0)
659 return r;
660 return 0;
663 int kvm_get_dirty_pages(kvm_context_t kvm, unsigned long phys_addr, void *buf)
665 int slot;
667 slot = get_slot(phys_addr);
668 return kvm_get_map(kvm, KVM_GET_DIRTY_LOG, slot, buf);
671 int kvm_get_dirty_pages_range(kvm_context_t kvm, unsigned long phys_addr,
672 unsigned long len, void *opaque,
673 int (*cb)(unsigned long start,
674 unsigned long len, void *bitmap,
675 void *opaque))
677 int i;
678 int r;
679 unsigned long end_addr = phys_addr + len;
680 void *buf;
682 for (i = 0; i < KVM_MAX_NUM_MEM_REGIONS; ++i) {
683 if ((slots[i].len && (uint64_t) slots[i].phys_addr >= phys_addr)
684 && ((uint64_t) slots[i].phys_addr + slots[i].len <= end_addr)) {
685 buf = qemu_malloc(BITMAP_SIZE(slots[i].len));
686 r = kvm_get_map(kvm, KVM_GET_DIRTY_LOG, i, buf);
687 if (r) {
688 qemu_free(buf);
689 return r;
691 r = cb(slots[i].phys_addr, slots[i].len, buf, opaque);
692 qemu_free(buf);
693 if (r)
694 return r;
697 return 0;
700 #ifdef KVM_CAP_IRQCHIP
702 int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int *status)
704 struct kvm_irq_level event;
705 int r;
707 if (!kvm->irqchip_in_kernel)
708 return 0;
709 event.level = level;
710 event.irq = irq;
711 r = kvm_vm_ioctl(kvm_state, kvm->irqchip_inject_ioctl, &event);
712 if (r < 0)
713 perror("kvm_set_irq_level");
715 if (status) {
716 #ifdef KVM_CAP_IRQ_INJECT_STATUS
717 *status =
718 (kvm->irqchip_inject_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
719 #else
720 *status = 1;
721 #endif
724 return 1;
727 int kvm_get_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
729 int r;
731 if (!kvm->irqchip_in_kernel)
732 return 0;
733 r = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip);
734 if (r < 0) {
735 perror("kvm_get_irqchip");
737 return r;
740 int kvm_set_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
742 int r;
744 if (!kvm->irqchip_in_kernel)
745 return 0;
746 r = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip);
747 if (r < 0) {
748 perror("kvm_set_irqchip");
750 return r;
753 #endif
755 static int handle_debug(CPUState *env)
757 #ifdef KVM_CAP_SET_GUEST_DEBUG
758 struct kvm_run *run = env->kvm_run;
760 return kvm_debug(env, &run->debug.arch);
761 #else
762 return 0;
763 #endif
766 int kvm_get_regs(CPUState *env, struct kvm_regs *regs)
768 return kvm_vcpu_ioctl(env, KVM_GET_REGS, regs);
771 int kvm_set_regs(CPUState *env, struct kvm_regs *regs)
773 return kvm_vcpu_ioctl(env, KVM_SET_REGS, regs);
776 int kvm_get_fpu(CPUState *env, struct kvm_fpu *fpu)
778 return kvm_vcpu_ioctl(env, KVM_GET_FPU, fpu);
781 int kvm_set_fpu(CPUState *env, struct kvm_fpu *fpu)
783 return kvm_vcpu_ioctl(env, KVM_SET_FPU, fpu);
786 int kvm_get_sregs(CPUState *env, struct kvm_sregs *sregs)
788 return kvm_vcpu_ioctl(env, KVM_GET_SREGS, sregs);
791 int kvm_set_sregs(CPUState *env, struct kvm_sregs *sregs)
793 return kvm_vcpu_ioctl(env, KVM_SET_SREGS, sregs);
796 #ifdef KVM_CAP_MP_STATE
797 int kvm_get_mpstate(CPUState *env, struct kvm_mp_state *mp_state)
799 int r;
801 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE);
802 if (r > 0)
803 return kvm_vcpu_ioctl(env, KVM_GET_MP_STATE, mp_state);
804 return -ENOSYS;
807 int kvm_set_mpstate(CPUState *env, struct kvm_mp_state *mp_state)
809 int r;
811 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE);
812 if (r > 0)
813 return kvm_vcpu_ioctl(env, KVM_SET_MP_STATE, mp_state);
814 return -ENOSYS;
816 #endif
818 static int handle_mmio(CPUState *env)
820 unsigned long addr = env->kvm_run->mmio.phys_addr;
821 struct kvm_run *kvm_run = env->kvm_run;
822 void *data = kvm_run->mmio.data;
824 /* hack: Red Hat 7.1 generates these weird accesses. */
825 if ((addr > 0xa0000 - 4 && addr <= 0xa0000) && kvm_run->mmio.len == 3)
826 return 0;
828 cpu_physical_memory_rw(addr, data, kvm_run->mmio.len, kvm_run->mmio.is_write);
829 return 0;
832 int handle_io_window(kvm_context_t kvm)
834 return 1;
837 int handle_shutdown(kvm_context_t kvm, CPUState *env)
839 /* stop the current vcpu from going back to guest mode */
840 env->stopped = 1;
842 qemu_system_reset_request();
843 return 1;
846 static inline void push_nmi(kvm_context_t kvm)
848 #ifdef KVM_CAP_USER_NMI
849 kvm_arch_push_nmi(kvm->opaque);
850 #endif /* KVM_CAP_USER_NMI */
853 void post_kvm_run(kvm_context_t kvm, CPUState *env)
855 pthread_mutex_lock(&qemu_mutex);
856 kvm_arch_post_run(env, env->kvm_run);
857 cpu_single_env = env;
860 int pre_kvm_run(kvm_context_t kvm, CPUState *env)
862 kvm_arch_pre_run(env, env->kvm_run);
864 if (env->kvm_cpu_state.regs_modified) {
865 kvm_arch_put_registers(env);
866 env->kvm_cpu_state.regs_modified = 0;
869 pthread_mutex_unlock(&qemu_mutex);
870 return 0;
873 int kvm_is_ready_for_interrupt_injection(CPUState *env)
875 return env->kvm_run->ready_for_interrupt_injection;
878 static int kvm_handle_internal_error(kvm_context_t kvm,
879 CPUState *env,
880 struct kvm_run *run)
882 fprintf(stderr, "KVM internal error. Suberror: %d\n",
883 run->internal.suberror);
884 #ifdef KVM_CAP_INTERNAL_ERROR_DATA
885 if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
886 int i;
888 for (i = 0; i < run->internal.ndata; ++i) {
889 fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
890 i, (uint64_t)run->internal.data[i]);
893 #endif
894 kvm_show_regs(env);
895 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION)
896 fprintf(stderr, "emulation failure\n");
897 vm_stop(0);
898 return 1;
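/*
 * Inner vcpu run loop: push any pending NMI, request an interrupt window
 * when the irqchip is emulated in userspace, enter the guest with KVM_RUN,
 * drain the coalesced-MMIO ring, and dispatch on run->exit_reason. It loops
 * (via the "again" label) until a handler returns non-zero, and propagates
 * that value to the caller.
 */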
901 int kvm_run(CPUState *env)
903 int r;
904 kvm_context_t kvm = &env->kvm_state->kvm_context;
905 struct kvm_run *run = env->kvm_run;
906 int fd = env->kvm_fd;
908 again:
909 push_nmi(kvm);
910 #if !defined(__s390__)
911 if (!kvm->irqchip_in_kernel)
912 run->request_interrupt_window = kvm_arch_try_push_interrupts(env);
913 #endif
915 r = pre_kvm_run(kvm, env);
916 if (r)
917 return r;
918 r = ioctl(fd, KVM_RUN, 0);
920 if (r == -1 && errno != EINTR && errno != EAGAIN) {
921 r = -errno;
922 post_kvm_run(kvm, env);
923 fprintf(stderr, "kvm_run: %s\n", strerror(-r));
924 return r;
927 post_kvm_run(kvm, env);
929 #if defined(KVM_CAP_COALESCED_MMIO)
930 if (kvm_state->coalesced_mmio) {
931 struct kvm_coalesced_mmio_ring *ring =
932 (void *) run + kvm_state->coalesced_mmio * PAGE_SIZE;
933 while (ring->first != ring->last) {
934 cpu_physical_memory_rw(ring->coalesced_mmio[ring->first].phys_addr,
935 &ring->coalesced_mmio[ring->first].data[0],
936 ring->coalesced_mmio[ring->first].len, 1);
937 smp_wmb();
938 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
941 #endif
943 #if !defined(__s390__)
944 if (r == -1) {
945 r = handle_io_window(kvm);
946 goto more;
948 #endif
949 if (1) {
950 switch (run->exit_reason) {
951 case KVM_EXIT_UNKNOWN:
952 r = handle_unhandled(run->hw.hardware_exit_reason);
953 break;
954 case KVM_EXIT_FAIL_ENTRY:
955 r = handle_unhandled(run->fail_entry.hardware_entry_failure_reason);
956 break;
957 case KVM_EXIT_EXCEPTION:
958 fprintf(stderr, "exception %d (%x)\n", run->ex.exception,
959 run->ex.error_code);
960 kvm_show_regs(env);
961 kvm_show_code(env);
962 abort();
963 break;
964 case KVM_EXIT_IO:
965 r = kvm_handle_io(run->io.port,
966 (uint8_t *)run + run->io.data_offset,
967 run->io.direction,
968 run->io.size,
969 run->io.count);
970 break;
971 case KVM_EXIT_DEBUG:
972 r = handle_debug(env);
973 break;
974 case KVM_EXIT_MMIO:
975 r = handle_mmio(env);
976 break;
977 case KVM_EXIT_HLT:
978 r = kvm_arch_halt(env);
979 break;
980 case KVM_EXIT_IRQ_WINDOW_OPEN:
981 break;
982 case KVM_EXIT_SHUTDOWN:
983 r = handle_shutdown(kvm, env);
984 break;
985 #if defined(__s390__)
986 case KVM_EXIT_S390_SIEIC:
987 r = kvm_s390_handle_intercept(kvm, env, run);
988 break;
989 case KVM_EXIT_S390_RESET:
990 r = kvm_s390_handle_reset(kvm, env, run);
991 break;
992 #endif
993 case KVM_EXIT_INTERNAL_ERROR:
994 r = kvm_handle_internal_error(kvm, env, run);
995 break;
996 default:
997 if (kvm_arch_run(env)) {
998 fprintf(stderr, "unhandled vm exit: 0x%x\n", run->exit_reason);
999 kvm_show_regs(env);
1000 abort();
1002 break;
1005 more:
1006 if (!r)
1007 goto again;
1008 return r;
1011 int kvm_inject_irq(CPUState *env, unsigned irq)
1013 struct kvm_interrupt intr;
1015 intr.irq = irq;
1016 return kvm_vcpu_ioctl(env, KVM_INTERRUPT, &intr);
1019 #ifdef KVM_CAP_SET_GUEST_DEBUG
1020 int kvm_set_guest_debug(CPUState *env, struct kvm_guest_debug *dbg)
1022 return kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, dbg);
1024 #endif
1026 int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
1028 struct kvm_signal_mask *sigmask;
1029 int r;
1031 if (!sigset) {
1032 return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);
1034 sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));
1036 sigmask->len = 8;
1037 memcpy(sigmask->sigset, sigset, sizeof(*sigset));
1038 r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
1039 free(sigmask);
1040 return r;
1043 int kvm_pit_in_kernel(kvm_context_t kvm)
1045 return kvm->pit_in_kernel;
1048 int kvm_inject_nmi(CPUState *env)
1050 #ifdef KVM_CAP_USER_NMI
1051 return kvm_vcpu_ioctl(env, KVM_NMI);
1052 #else
1053 return -ENOSYS;
1054 #endif
1057 int kvm_init_coalesced_mmio(kvm_context_t kvm)
1059 int r = 0;
1060 kvm_state->coalesced_mmio = 0;
1061 #ifdef KVM_CAP_COALESCED_MMIO
1062 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
1063 if (r > 0) {
1064 kvm_state->coalesced_mmio = r;
1065 return 0;
1067 #endif
1068 return r;
1071 #ifdef KVM_CAP_DEVICE_ASSIGNMENT
1072 int kvm_assign_pci_device(kvm_context_t kvm,
1073 struct kvm_assigned_pci_dev *assigned_dev)
1075 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_PCI_DEVICE, assigned_dev);
1078 static int kvm_old_assign_irq(kvm_context_t kvm,
1079 struct kvm_assigned_irq *assigned_irq)
1081 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_IRQ, assigned_irq);
1084 #ifdef KVM_CAP_ASSIGN_DEV_IRQ
1085 int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
1087 int ret;
1089 ret = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_ASSIGN_DEV_IRQ);
1090 if (ret > 0) {
1091 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_DEV_IRQ, assigned_irq);
1094 return kvm_old_assign_irq(kvm, assigned_irq);
1097 int kvm_deassign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
1099 return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_DEV_IRQ, assigned_irq);
1101 #else
1102 int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
1104 return kvm_old_assign_irq(kvm, assigned_irq);
1106 #endif
1107 #endif
1109 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT
1110 int kvm_deassign_pci_device(kvm_context_t kvm,
1111 struct kvm_assigned_pci_dev *assigned_dev)
1113 return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_PCI_DEVICE, assigned_dev);
1115 #endif
1117 int kvm_destroy_memory_region_works(kvm_context_t kvm)
1119 int ret = 0;
1121 #ifdef KVM_CAP_DESTROY_MEMORY_REGION_WORKS
1122 ret =
1123 kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION,
1124 KVM_CAP_DESTROY_MEMORY_REGION_WORKS);
1125 if (ret <= 0)
1126 ret = 0;
1127 #endif
1128 return ret;
1131 int kvm_reinject_control(kvm_context_t kvm, int pit_reinject)
1133 #ifdef KVM_CAP_REINJECT_CONTROL
1134 int r;
1135 struct kvm_reinject_control control;
1137 control.pit_reinject = pit_reinject;
1139 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_REINJECT_CONTROL);
1140 if (r > 0) {
1141 return kvm_vm_ioctl(kvm_state, KVM_REINJECT_CONTROL, &control);
1143 #endif
1144 return -ENOSYS;
1147 int kvm_has_gsi_routing(kvm_context_t kvm)
1149 int r = 0;
1151 #ifdef KVM_CAP_IRQ_ROUTING
1152 r = kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
1153 #endif
1154 return r;
1157 int kvm_get_gsi_count(kvm_context_t kvm)
1159 #ifdef KVM_CAP_IRQ_ROUTING
1160 return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
1161 #else
1162 return -EINVAL;
1163 #endif
1166 int kvm_clear_gsi_routes(kvm_context_t kvm)
1168 #ifdef KVM_CAP_IRQ_ROUTING
1169 kvm->irq_routes->nr = 0;
1170 return 0;
1171 #else
1172 return -EINVAL;
1173 #endif
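/*
 * Append a routing entry to the cached routing table, growing the table
 * geometrically (doubling, with a floor of 64 entries). Nothing reaches the
 * kernel until kvm_commit_irq_routes() issues KVM_SET_GSI_ROUTING.
 */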
1176 int kvm_add_routing_entry(kvm_context_t kvm,
1177 struct kvm_irq_routing_entry *entry)
1179 #ifdef KVM_CAP_IRQ_ROUTING
1180 struct kvm_irq_routing *z;
1181 struct kvm_irq_routing_entry *new;
1182 int n, size;
1184 if (kvm->irq_routes->nr == kvm->nr_allocated_irq_routes) {
1185 n = kvm->nr_allocated_irq_routes * 2;
1186 if (n < 64)
1187 n = 64;
1188 size = sizeof(struct kvm_irq_routing);
1189 size += n * sizeof(*new);
1190 z = realloc(kvm->irq_routes, size);
1191 if (!z)
1192 return -ENOMEM;
1193 kvm->nr_allocated_irq_routes = n;
1194 kvm->irq_routes = z;
1196 n = kvm->irq_routes->nr++;
1197 new = &kvm->irq_routes->entries[n];
1198 memset(new, 0, sizeof(*new));
1199 new->gsi = entry->gsi;
1200 new->type = entry->type;
1201 new->flags = entry->flags;
1202 new->u = entry->u;
1204 set_gsi(kvm, entry->gsi);
1206 return 0;
1207 #else
1208 return -ENOSYS;
1209 #endif
1212 int kvm_add_irq_route(kvm_context_t kvm, int gsi, int irqchip, int pin)
1214 #ifdef KVM_CAP_IRQ_ROUTING
1215 struct kvm_irq_routing_entry e;
1217 e.gsi = gsi;
1218 e.type = KVM_IRQ_ROUTING_IRQCHIP;
1219 e.flags = 0;
1220 e.u.irqchip.irqchip = irqchip;
1221 e.u.irqchip.pin = pin;
1222 return kvm_add_routing_entry(kvm, &e);
1223 #else
1224 return -ENOSYS;
1225 #endif
1228 int kvm_del_routing_entry(kvm_context_t kvm,
1229 struct kvm_irq_routing_entry *entry)
1231 #ifdef KVM_CAP_IRQ_ROUTING
1232 struct kvm_irq_routing_entry *e, *p;
1233 int i, gsi, found = 0;
1235 gsi = entry->gsi;
1237 for (i = 0; i < kvm->irq_routes->nr; ++i) {
1238 e = &kvm->irq_routes->entries[i];
1239 if (e->type == entry->type && e->gsi == gsi) {
1240 switch (e->type) {
1241 case KVM_IRQ_ROUTING_IRQCHIP:{
1242 if (e->u.irqchip.irqchip ==
1243 entry->u.irqchip.irqchip
1244 && e->u.irqchip.pin == entry->u.irqchip.pin) {
1245 p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
1246 *e = *p;
1247 found = 1;
1249 break;
1251 case KVM_IRQ_ROUTING_MSI:{
1252 if (e->u.msi.address_lo ==
1253 entry->u.msi.address_lo
1254 && e->u.msi.address_hi ==
1255 entry->u.msi.address_hi
1256 && e->u.msi.data == entry->u.msi.data) {
1257 p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
1258 *e = *p;
1259 found = 1;
1261 break;
1263 default:
1264 break;
1266 if (found) {
1267 /* If there are no other users of this GSI
1268 * mark it available in the bitmap */
1269 for (i = 0; i < kvm->irq_routes->nr; i++) {
1270 e = &kvm->irq_routes->entries[i];
1271 if (e->gsi == gsi)
1272 break;
1274 if (i == kvm->irq_routes->nr)
1275 clear_gsi(kvm, gsi);
1277 return 0;
1281 return -ESRCH;
1282 #else
1283 return -ENOSYS;
1284 #endif
1287 int kvm_update_routing_entry(kvm_context_t kvm,
1288 struct kvm_irq_routing_entry *entry,
1289 struct kvm_irq_routing_entry *newentry)
1291 #ifdef KVM_CAP_IRQ_ROUTING
1292 struct kvm_irq_routing_entry *e;
1293 int i;
1295 if (entry->gsi != newentry->gsi || entry->type != newentry->type) {
1296 return -EINVAL;
1299 for (i = 0; i < kvm->irq_routes->nr; ++i) {
1300 e = &kvm->irq_routes->entries[i];
1301 if (e->type != entry->type || e->gsi != entry->gsi) {
1302 continue;
1304 switch (e->type) {
1305 case KVM_IRQ_ROUTING_IRQCHIP:
1306 if (e->u.irqchip.irqchip == entry->u.irqchip.irqchip &&
1307 e->u.irqchip.pin == entry->u.irqchip.pin) {
1308 memcpy(&e->u.irqchip, &newentry->u.irqchip,
1309 sizeof e->u.irqchip);
1310 return 0;
1312 break;
1313 case KVM_IRQ_ROUTING_MSI:
1314 if (e->u.msi.address_lo == entry->u.msi.address_lo &&
1315 e->u.msi.address_hi == entry->u.msi.address_hi &&
1316 e->u.msi.data == entry->u.msi.data) {
1317 memcpy(&e->u.msi, &newentry->u.msi, sizeof e->u.msi);
1318 return 0;
1320 break;
1321 default:
1322 break;
1325 return -ESRCH;
1326 #else
1327 return -ENOSYS;
1328 #endif
1331 int kvm_del_irq_route(kvm_context_t kvm, int gsi, int irqchip, int pin)
1333 #ifdef KVM_CAP_IRQ_ROUTING
1334 struct kvm_irq_routing_entry e;
1336 e.gsi = gsi;
1337 e.type = KVM_IRQ_ROUTING_IRQCHIP;
1338 e.flags = 0;
1339 e.u.irqchip.irqchip = irqchip;
1340 e.u.irqchip.pin = pin;
1341 return kvm_del_routing_entry(kvm, &e);
1342 #else
1343 return -ENOSYS;
1344 #endif
1347 int kvm_commit_irq_routes(kvm_context_t kvm)
1349 #ifdef KVM_CAP_IRQ_ROUTING
1350 kvm->irq_routes->flags = 0;
1351 return kvm_vm_ioctl(kvm_state, KVM_SET_GSI_ROUTING, kvm->irq_routes);
1352 #else
1353 return -ENOSYS;
1354 #endif
1357 int kvm_get_irq_route_gsi(kvm_context_t kvm)
1359 int i, bit;
1360 uint32_t *buf = kvm->used_gsi_bitmap;
1362 /* Return the lowest unused GSI in the bitmap */
1363 for (i = 0; i < kvm->max_gsi / 32; i++) {
1364 bit = ffs(~buf[i]);
1365 if (!bit)
1366 continue;
1368 return bit - 1 + i * 32;
1371 return -ENOSPC;
1374 #ifdef KVM_CAP_DEVICE_MSIX
1375 int kvm_assign_set_msix_nr(kvm_context_t kvm,
1376 struct kvm_assigned_msix_nr *msix_nr)
1378 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_NR, msix_nr);
1381 int kvm_assign_set_msix_entry(kvm_context_t kvm,
1382 struct kvm_assigned_msix_entry *entry)
1384 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_ENTRY, entry);
1386 #endif
1388 #if defined(KVM_CAP_IRQFD) && defined(CONFIG_EVENTFD)
1390 #include <sys/eventfd.h>
1392 static int _kvm_irqfd(kvm_context_t kvm, int fd, int gsi, int flags)
1394 struct kvm_irqfd data = {
1395 .fd = fd,
1396 .gsi = gsi,
1397 .flags = flags,
1400 return kvm_vm_ioctl(kvm_state, KVM_IRQFD, &data);
1403 int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
1405 int r;
1406 int fd;
1408 if (!kvm_check_extension(kvm_state, KVM_CAP_IRQFD))
1409 return -ENOENT;
1411 fd = eventfd(0, 0);
1412 if (fd < 0)
1413 return -errno;
1415 r = _kvm_irqfd(kvm, fd, gsi, 0);
1416 if (r < 0) {
1417 close(fd);
1418 return -errno;
1421 return fd;
1424 #else /* KVM_CAP_IRQFD */
1426 int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
1428 return -ENOSYS;
1431 #endif /* KVM_CAP_IRQFD */
1432 static inline unsigned long kvm_get_thread_id(void)
1434 return syscall(SYS_gettid);
1437 static void qemu_cond_wait(pthread_cond_t *cond)
1439 CPUState *env = cpu_single_env;
1441 pthread_cond_wait(cond, &qemu_mutex);
1442 cpu_single_env = env;
1445 static void sig_ipi_handler(int n)
1449 static void hardware_memory_error(void)
1451 fprintf(stderr, "Hardware memory error!\n");
1452 exit(1);
1455 static void sigbus_reraise(void)
1457 sigset_t set;
1458 struct sigaction action;
1460 memset(&action, 0, sizeof(action));
1461 action.sa_handler = SIG_DFL;
1462 if (!sigaction(SIGBUS, &action, NULL)) {
1463 raise(SIGBUS);
1464 sigemptyset(&set);
1465 sigaddset(&set, SIGBUS);
1466 sigprocmask(SIG_UNBLOCK, &set, NULL);
1468 perror("Failed to re-raise SIGBUS!\n");
1469 abort();
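/*
 * SIGBUS handler for the io thread (dispatched through signalfd). An AO
 * (action-optional) machine check that hits guest RAM is forged into a
 * guest MCE on the boot cpu and broadcast to the other vcpus; AO errors in
 * QEMU's own memory are merely reported, while AR (action-required) errors
 * are fatal.
 */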
1472 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
1473 void *ctx)
1475 #if defined(KVM_CAP_MCE) && defined(TARGET_I386)
1476 if (first_cpu->mcg_cap && siginfo->ssi_addr
1477 && siginfo->ssi_code == BUS_MCEERR_AO) {
1478 uint64_t status;
1479 unsigned long paddr;
1480 CPUState *cenv;
1482 /* Hope we are lucky for AO MCE */
1483 if (do_qemu_ram_addr_from_host((void *)(intptr_t)siginfo->ssi_addr,
1484 &paddr)) {
1485 fprintf(stderr, "Hardware memory error for memory used by "
1486 "QEMU itself instead of guest system!: %llx\n",
1487 (unsigned long long)siginfo->ssi_addr);
1488 return;
1490 status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1491 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1492 | 0xc0;
1493 kvm_inject_x86_mce(first_cpu, 9, status,
1494 MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
1495 (MCM_ADDR_PHYS << 6) | 0xc, 1);
1496 for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
1497 kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
1498 MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
1499 } else
1500 #endif
1502 if (siginfo->ssi_code == BUS_MCEERR_AO)
1503 return;
1504 else if (siginfo->ssi_code == BUS_MCEERR_AR)
1505 hardware_memory_error();
1506 else
1507 sigbus_reraise();
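/*
 * Run func(data) on a given vcpu's thread. If we already are that thread,
 * call it directly; otherwise queue a work item on the vcpu, kick the
 * thread with SIG_IPI and sleep on qemu_work_cond until flush_queued_work()
 * has run the item.
 */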
1511 static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
1513 struct qemu_work_item wi;
1515 if (env == current_env) {
1516 func(data);
1517 return;
1520 wi.func = func;
1521 wi.data = data;
1522 if (!env->kvm_cpu_state.queued_work_first)
1523 env->kvm_cpu_state.queued_work_first = &wi;
1524 else
1525 env->kvm_cpu_state.queued_work_last->next = &wi;
1526 env->kvm_cpu_state.queued_work_last = &wi;
1527 wi.next = NULL;
1528 wi.done = false;
1530 pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
1531 while (!wi.done)
1532 qemu_cond_wait(&qemu_work_cond);
1535 void kvm_arch_get_registers(CPUState *env)
1537 kvm_arch_save_regs(env);
1540 static void do_kvm_cpu_synchronize_state(void *_env)
1542 CPUState *env = _env;
1543 if (!env->kvm_cpu_state.regs_modified) {
1544 kvm_arch_get_registers(env);
1545 env->kvm_cpu_state.regs_modified = 1;
1549 void kvm_cpu_synchronize_state(CPUState *env)
1551 if (!env->kvm_cpu_state.regs_modified)
1552 on_vcpu(env, do_kvm_cpu_synchronize_state, env);
1555 static void inject_interrupt(void *data)
1557 cpu_interrupt(current_env, (long) data);
1560 void kvm_inject_interrupt(CPUState *env, int mask)
1562 on_vcpu(env, inject_interrupt, (void *) (long) mask);
1565 void kvm_update_interrupt_request(CPUState *env)
1567 int signal = 0;
1569 if (env) {
1570 if (!current_env || !current_env->created)
1571 signal = 1;
1572 /*
1573  * Testing for created here is really redundant
1574  */
1575 if (current_env && current_env->created &&
1576 env != current_env && !env->kvm_cpu_state.signalled)
1577 signal = 1;
1579 if (signal) {
1580 env->kvm_cpu_state.signalled = 1;
1581 if (env->kvm_cpu_state.thread)
1582 pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
1587 static void kvm_do_load_registers(void *_env)
1589 CPUState *env = _env;
1591 kvm_arch_load_regs(env);
1594 void kvm_load_registers(CPUState *env)
1596 if (kvm_enabled() && qemu_system_ready)
1597 on_vcpu(env, kvm_do_load_registers, env);
1600 static void kvm_do_save_registers(void *_env)
1602 CPUState *env = _env;
1604 kvm_arch_save_regs(env);
1607 void kvm_save_registers(CPUState *env)
1609 if (kvm_enabled())
1610 on_vcpu(env, kvm_do_save_registers, env);
1613 static void kvm_do_load_mpstate(void *_env)
1615 CPUState *env = _env;
1617 kvm_arch_load_mpstate(env);
1620 void kvm_load_mpstate(CPUState *env)
1622 if (kvm_enabled() && qemu_system_ready && kvm_vcpu_inited(env))
1623 on_vcpu(env, kvm_do_load_mpstate, env);
1626 static void kvm_do_save_mpstate(void *_env)
1628 CPUState *env = _env;
1630 kvm_arch_save_mpstate(env);
1631 #ifdef KVM_CAP_MP_STATE
1632 if (kvm_irqchip_in_kernel())
1633 env->halted = (env->mp_state == KVM_MP_STATE_HALTED);
1634 #endif
1637 void kvm_save_mpstate(CPUState *env)
1639 if (kvm_enabled())
1640 on_vcpu(env, kvm_do_save_mpstate, env);
1643 int kvm_cpu_exec(CPUState *env)
1645 int r;
1647 r = kvm_run(env);
1648 if (r < 0) {
1649 printf("kvm_run returned %d\n", r);
1650 vm_stop(0);
1653 return 0;
1656 static int is_cpu_stopped(CPUState *env)
1658 return !vm_running || env->stopped;
1661 static void flush_queued_work(CPUState *env)
1663 struct qemu_work_item *wi;
1665 if (!env->kvm_cpu_state.queued_work_first)
1666 return;
1668 while ((wi = env->kvm_cpu_state.queued_work_first)) {
1669 env->kvm_cpu_state.queued_work_first = wi->next;
1670 wi->func(wi->data);
1671 wi->done = true;
1673 env->kvm_cpu_state.queued_work_last = NULL;
1674 pthread_cond_broadcast(&qemu_work_cond);
1677 static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
1679 #if defined(KVM_CAP_MCE) && defined(TARGET_I386)
1680 struct kvm_x86_mce mce = {
1681 .bank = 9,
1683 unsigned long paddr;
1684 int r;
1686 if (env->mcg_cap && siginfo->si_addr
1687 && (siginfo->si_code == BUS_MCEERR_AR
1688 || siginfo->si_code == BUS_MCEERR_AO)) {
1689 if (siginfo->si_code == BUS_MCEERR_AR) {
1690 /* Fake an Intel architectural Data Load SRAR UCR */
1691 mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1692 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1693 | MCI_STATUS_AR | 0x134;
1694 mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
1695 mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
1696 } else {
1697 /* Fake an Intel architectural Memory scrubbing UCR */
1698 mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1699 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1700 | 0xc0;
1701 mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
1702 mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
1704 if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) {
1705 fprintf(stderr, "Hardware memory error for memory used by "
1706 "QEMU itself instaed of guest system!\n");
1707 /* Hope we are lucky for AO MCE */
1708 if (siginfo->si_code == BUS_MCEERR_AO)
1709 return;
1710 else
1711 hardware_memory_error();
1713 mce.addr = paddr;
1714 r = kvm_set_mce(env, &mce);
1715 if (r < 0) {
1716 fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
1717 abort();
1719 } else
1720 #endif
1722 if (siginfo->si_code == BUS_MCEERR_AO)
1723 return;
1724 else if (siginfo->si_code == BUS_MCEERR_AR)
1725 hardware_memory_error();
1726 else
1727 sigbus_reraise();
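/*
 * Per-vcpu wait: drop qemu_mutex and sigtimedwait() for SIG_IPI/SIGBUS for
 * up to @timeout ms, then run queued work items and honour a pending stop
 * request. The loop re-checks sigpending() so a kick that raced with the
 * timeout is not lost.
 */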
1731 static void kvm_main_loop_wait(CPUState *env, int timeout)
1733 struct timespec ts;
1734 int r, e;
1735 siginfo_t siginfo;
1736 sigset_t waitset;
1737 sigset_t chkset;
1739 ts.tv_sec = timeout / 1000;
1740 ts.tv_nsec = (timeout % 1000) * 1000000;
1741 sigemptyset(&waitset);
1742 sigaddset(&waitset, SIG_IPI);
1743 sigaddset(&waitset, SIGBUS);
1745 do {
1746 pthread_mutex_unlock(&qemu_mutex);
1748 r = sigtimedwait(&waitset, &siginfo, &ts);
1749 e = errno;
1751 pthread_mutex_lock(&qemu_mutex);
1753 if (r == -1 && !(e == EAGAIN || e == EINTR)) {
1754 printf("sigtimedwait: %s\n", strerror(e));
1755 exit(1);
1758 switch (r) {
1759 case SIGBUS:
1760 kvm_on_sigbus(env, &siginfo);
1761 break;
1762 default:
1763 break;
1766 r = sigpending(&chkset);
1767 if (r == -1) {
1768 printf("sigpending: %s\n", strerror(e));
1769 exit(1);
1771 } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
1773 cpu_single_env = env;
1774 flush_queued_work(env);
1776 if (env->stop) {
1777 env->stop = 0;
1778 env->stopped = 1;
1779 pthread_cond_signal(&qemu_pause_cond);
1782 env->kvm_cpu_state.signalled = 0;
1785 static int all_threads_paused(void)
1787 CPUState *penv = first_cpu;
1789 while (penv) {
1790 if (penv->stop)
1791 return 0;
1792 penv = (CPUState *) penv->next_cpu;
1795 return 1;
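/*
 * Stop all vcpu threads (used around reset and vm_stop). Remote vcpus are
 * flagged and kicked with SIG_IPI; the calling vcpu, if any, is stopped in
 * place. Each thread acknowledges through qemu_pause_cond.
 */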
1798 static void pause_all_threads(void)
1800 CPUState *penv = first_cpu;
1802 while (penv) {
1803 if (penv != cpu_single_env) {
1804 penv->stop = 1;
1805 pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI);
1806 } else {
1807 penv->stop = 0;
1808 penv->stopped = 1;
1809 cpu_exit(penv);
1811 penv = (CPUState *) penv->next_cpu;
1814 while (!all_threads_paused())
1815 qemu_cond_wait(&qemu_pause_cond);
1818 static void resume_all_threads(void)
1820 CPUState *penv = first_cpu;
1822 assert(!cpu_single_env);
1824 while (penv) {
1825 penv->stop = 0;
1826 penv->stopped = 0;
1827 pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI);
1828 penv = (CPUState *) penv->next_cpu;
1832 static void kvm_vm_state_change_handler(void *context, int running, int reason)
1834 if (running)
1835 resume_all_threads();
1836 else
1837 pause_all_threads();
1840 static void setup_kernel_sigmask(CPUState *env)
1842 sigset_t set;
1844 sigemptyset(&set);
1845 sigaddset(&set, SIGUSR2);
1846 sigaddset(&set, SIGIO);
1847 sigaddset(&set, SIGALRM);
1848 sigprocmask(SIG_BLOCK, &set, NULL);
1850 sigprocmask(SIG_BLOCK, NULL, &set);
1851 sigdelset(&set, SIG_IPI);
1852 sigdelset(&set, SIGBUS);
1854 kvm_set_signal_mask(env, &set);
1857 static void qemu_kvm_system_reset(void)
1859 CPUState *penv = first_cpu;
1861 pause_all_threads();
1863 qemu_system_reset();
1865 while (penv) {
1866 kvm_arch_cpu_reset(penv);
1867 penv = (CPUState *) penv->next_cpu;
1870 resume_all_threads();
1873 static void process_irqchip_events(CPUState *env)
1875 kvm_arch_process_irqchip_events(env);
1876 if (kvm_arch_has_work(env))
1877 env->halted = 0;
1880 static int kvm_main_loop_cpu(CPUState *env)
1882 while (1) {
1883 int run_cpu = !is_cpu_stopped(env);
1884 if (run_cpu && !kvm_irqchip_in_kernel()) {
1885 process_irqchip_events(env);
1886 run_cpu = !env->halted;
1888 if (run_cpu) {
1889 kvm_cpu_exec(env);
1890 kvm_main_loop_wait(env, 0);
1891 } else {
1892 kvm_main_loop_wait(env, 1000);
1895 pthread_mutex_unlock(&qemu_mutex);
1896 return 0;
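/*
 * Entry point of every vcpu thread: block all signals (KVM_RUN gets its own
 * mask via setup_kernel_sigmask()), create the vcpu, signal creation back
 * to kvm_init_vcpu(), wait for machine initialization to finish and then
 * run kvm_main_loop_cpu() forever.
 */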
1899 static void *ap_main_loop(void *_env)
1901 CPUState *env = _env;
1902 sigset_t signals;
1903 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1904 struct ioperm_data *data = NULL;
1905 #endif
1907 current_env = env;
1908 env->thread_id = kvm_get_thread_id();
1909 sigfillset(&signals);
1910 sigprocmask(SIG_BLOCK, &signals, NULL);
1911 kvm_create_vcpu(env, env->cpu_index);
1913 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1914 /* do ioperm for io ports of assigned devices */
1915 QLIST_FOREACH(data, &ioperm_head, entries)
1916 on_vcpu(env, kvm_arch_do_ioperm, data);
1917 #endif
1919 setup_kernel_sigmask(env);
1921 pthread_mutex_lock(&qemu_mutex);
1922 cpu_single_env = env;
1924 kvm_arch_init_vcpu(env);
1926 kvm_arch_load_regs(env);
1928 /* signal VCPU creation */
1929 current_env->created = 1;
1930 pthread_cond_signal(&qemu_vcpu_cond);
1932 /* and wait for machine initialization */
1933 while (!qemu_system_ready)
1934 qemu_cond_wait(&qemu_system_cond);
1936 /* re-initialize cpu_single_env after re-acquiring qemu_mutex */
1937 cpu_single_env = env;
1939 kvm_main_loop_cpu(env);
1940 return NULL;
1943 void kvm_init_vcpu(CPUState *env)
1945 pthread_create(&env->kvm_cpu_state.thread, NULL, ap_main_loop, env);
1947 while (env->created == 0)
1948 qemu_cond_wait(&qemu_vcpu_cond);
1951 int kvm_vcpu_inited(CPUState *env)
1953 return env->created;
1956 #ifdef TARGET_I386
1957 void kvm_hpet_disable_kpit(void)
1959 struct kvm_pit_state2 ps2;
1961 kvm_get_pit2(kvm_context, &ps2);
1962 ps2.flags |= KVM_PIT_FLAGS_HPET_LEGACY;
1963 kvm_set_pit2(kvm_context, &ps2);
1966 void kvm_hpet_enable_kpit(void)
1968 struct kvm_pit_state2 ps2;
1970 kvm_get_pit2(kvm_context, &ps2);
1971 ps2.flags &= ~KVM_PIT_FLAGS_HPET_LEGACY;
1972 kvm_set_pit2(kvm_context, &ps2);
1974 #endif
1976 int kvm_init_ap(void)
1978 struct sigaction action;
1980 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
1982 signal(SIG_IPI, sig_ipi_handler);
1984 memset(&action, 0, sizeof(action));
1985 action.sa_flags = SA_SIGINFO;
1986 action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
1987 sigaction(SIGBUS, &action, NULL);
1988 prctl(PR_MCE_KILL, 1, 1, 0, 0);
1989 return 0;
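/*
 * Wake the io thread out of select(): write an 8-byte value to the eventfd
 * (or pipe) that it polls. The retry loop below only matters for the pipe
 * case, where short writes are possible.
 */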
1992 void qemu_kvm_notify_work(void)
1994 uint64_t value = 1;
1995 char buffer[8];
1996 size_t offset = 0;
1998 if (io_thread_fd == -1)
1999 return;
2001 memcpy(buffer, &value, sizeof(value));
2003 while (offset < 8) {
2004 ssize_t len;
2006 len = write(io_thread_fd, buffer + offset, 8 - offset);
2007 if (len == -1 && errno == EINTR)
2008 continue;
2010 /* In case we have a pipe, there is no reason to insist on writing
2011  * 8 bytes
2012  */
2013 if (len == -1 && errno == EAGAIN)
2014 break;
2016 if (len <= 0)
2017 break;
2019 offset += len;
2023 /* If we have signalfd, we mask out the signals we want to handle and then
2024 * use signalfd to listen for them. We rely on whatever the current signal
2025  * handler is to dispatch the signals when we receive them.
2026  */
2028 static void sigfd_handler(void *opaque)
2030 int fd = (unsigned long) opaque;
2031 struct qemu_signalfd_siginfo info;
2032 struct sigaction action;
2033 ssize_t len;
2035 while (1) {
2036 do {
2037 len = read(fd, &info, sizeof(info));
2038 } while (len == -1 && errno == EINTR);
2040 if (len == -1 && errno == EAGAIN)
2041 break;
2043 if (len != sizeof(info)) {
2044 printf("read from sigfd returned %zd: %m\n", len);
2045 return;
2048 sigaction(info.ssi_signo, NULL, &action);
2049 if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction)
2050 action.sa_sigaction(info.ssi_signo,
2051 (siginfo_t *)&info, NULL);
2052 else if (action.sa_handler)
2053 action.sa_handler(info.ssi_signo);
2058 /* Used to break IO thread out of select */
2059 static void io_thread_wakeup(void *opaque)
2061 int fd = (unsigned long) opaque;
2062 char buffer[4096];
2064 /* Drain the pipe/(eventfd) */
2065 while (1) {
2066 ssize_t len;
2068 len = read(fd, buffer, sizeof(buffer));
2069 if (len == -1 && errno == EINTR)
2070 continue;
2072 if (len <= 0)
2073 break;
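/*
 * Main loop of the io thread: set up the self-wakeup eventfd and a signalfd
 * for SIGIO/SIGALRM/SIGBUS, release the vcpus waiting on qemu_system_cond,
 * then service main_loop_wait() until shutdown is requested, handling
 * reset, powerdown and debug events along the way.
 */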
2077 int kvm_main_loop(void)
2079 int fds[2];
2080 sigset_t mask;
2081 int sigfd;
2083 io_thread = pthread_self();
2084 qemu_system_ready = 1;
2086 if (qemu_eventfd(fds) == -1) {
2087 fprintf(stderr, "failed to create eventfd\n");
2088 return -errno;
2091 fcntl(fds[0], F_SETFL, O_NONBLOCK);
2092 fcntl(fds[1], F_SETFL, O_NONBLOCK);
2094 qemu_set_fd_handler2(fds[0], NULL, io_thread_wakeup, NULL,
2095 (void *)(unsigned long) fds[0]);
2097 io_thread_fd = fds[1];
2099 sigemptyset(&mask);
2100 sigaddset(&mask, SIGIO);
2101 sigaddset(&mask, SIGALRM);
2102 sigaddset(&mask, SIGBUS);
2103 sigprocmask(SIG_BLOCK, &mask, NULL);
2105 sigfd = qemu_signalfd(&mask);
2106 if (sigfd == -1) {
2107 fprintf(stderr, "failed to create signalfd\n");
2108 return -errno;
2111 fcntl(sigfd, F_SETFL, O_NONBLOCK);
2113 qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
2114 (void *)(unsigned long) sigfd);
2116 pthread_cond_broadcast(&qemu_system_cond);
2118 io_thread_sigfd = sigfd;
2119 cpu_single_env = NULL;
2121 while (1) {
2122 main_loop_wait(1000);
2123 if (qemu_shutdown_requested()) {
2124 if (qemu_no_shutdown()) {
2125 vm_stop(0);
2126 } else
2127 break;
2128 } else if (qemu_powerdown_requested()) {
2129 monitor_protocol_event(QEVENT_POWERDOWN, NULL);
2130 qemu_irq_raise(qemu_system_powerdown);
2131 } else if (qemu_reset_requested()) {
2132 monitor_protocol_event(QEVENT_RESET, NULL);
2133 qemu_kvm_system_reset();
2134 } else if (kvm_debug_cpu_requested) {
2135 monitor_protocol_event(QEVENT_DEBUG, NULL);
2136 gdb_set_stop_cpu(kvm_debug_cpu_requested);
2137 vm_stop(EXCP_DEBUG);
2138 kvm_debug_cpu_requested = NULL;
2142 pause_all_threads();
2143 pthread_mutex_unlock(&qemu_mutex);
2145 return 0;
2148 #ifdef TARGET_I386
2149 static int destroy_region_works = 0;
2150 #endif
2153 #if !defined(TARGET_I386)
2154 int kvm_arch_init_irq_routing(void)
2156 return 0;
2158 #endif
2160 extern int no_hpet;
2162 static int kvm_create_context(void)
2164 int r;
2166 if (!kvm_irqchip) {
2167 kvm_disable_irqchip_creation(kvm_context);
2169 if (!kvm_pit) {
2170 kvm_disable_pit_creation(kvm_context);
2172 if (kvm_create(kvm_context, 0, NULL) < 0) {
2173 kvm_finalize(kvm_state);
2174 return -1;
2176 r = kvm_arch_qemu_create_context();
2177 if (r < 0)
2178 kvm_finalize(kvm_state);
2179 if (kvm_pit && !kvm_pit_reinject) {
2180 if (kvm_reinject_control(kvm_context, 0)) {
2181 fprintf(stderr, "failure to disable in-kernel PIT reinjection\n");
2182 return -1;
2185 #ifdef TARGET_I386
2186 destroy_region_works = kvm_destroy_memory_region_works(kvm_context);
2187 #endif
2189 r = kvm_arch_init_irq_routing();
2190 if (r < 0) {
2191 return r;
2194 kvm_state->vcpu_events = 0;
2195 #ifdef KVM_CAP_VCPU_EVENTS
2196 kvm_state->vcpu_events = kvm_check_extension(kvm_state, KVM_CAP_VCPU_EVENTS);
2197 #endif
2199 kvm_init_ap();
2200 if (kvm_irqchip) {
2201 if (!qemu_kvm_has_gsi_routing()) {
2202 irq0override = 0;
2203 #ifdef TARGET_I386
2204 /* if kernel can't do irq routing, interrupt source
2205  * override 0->2 cannot be set up as required by hpet,
2206  * so disable hpet.
2207  */
2208 no_hpet = 1;
2209 } else if (!qemu_kvm_has_pit_state2()) {
2210 no_hpet = 1;
2212 #else
2214 #endif
2217 return 0;
2220 #ifdef TARGET_I386
2221 static int must_use_aliases_source(target_phys_addr_t addr)
2223 if (destroy_region_works)
2224 return false;
2225 if (addr == 0xa0000 || addr == 0xa8000)
2226 return true;
2227 return false;
2230 static int must_use_aliases_target(target_phys_addr_t addr)
2232 if (destroy_region_works)
2233 return false;
2234 if (addr >= 0xe0000000 && addr < 0x100000000ull)
2235 return true;
2236 return false;
2239 static struct mapping {
2240 target_phys_addr_t phys;
2241 ram_addr_t ram;
2242 ram_addr_t len;
2243 } mappings[50];
2244 static int nr_mappings;
2246 static struct mapping *find_ram_mapping(ram_addr_t ram_addr)
2248 struct mapping *p;
2250 for (p = mappings; p < mappings + nr_mappings; ++p) {
2251 if (p->ram <= ram_addr && ram_addr < p->ram + p->len) {
2252 return p;
2255 return NULL;
2258 static struct mapping *find_mapping(target_phys_addr_t start_addr)
2260 struct mapping *p;
2262 for (p = mappings; p < mappings + nr_mappings; ++p) {
2263 if (p->phys <= start_addr && start_addr < p->phys + p->len) {
2264 return p;
2267 return NULL;
2270 static void drop_mapping(target_phys_addr_t start_addr)
2272 struct mapping *p = find_mapping(start_addr);
2274 if (p)
2275 *p = mappings[--nr_mappings];
2277 #endif
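/*
 * Map a qemu physical-memory range onto a KVM memory slot. Non-RAM (I/O)
 * ranges tear down any overlapping slots; RAM is registered through
 * kvm_register_phys_mem(). On i386, VGA-style regions that must alias each
 * other go through memory aliases when the kernel cannot safely destroy
 * regions (see must_use_aliases_*() above).
 */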
2279 void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
2280 ram_addr_t phys_offset)
2282 int r = 0;
2283 unsigned long area_flags;
2284 #ifdef TARGET_I386
2285 struct mapping *p;
2286 #endif
2288 if (start_addr + size > phys_ram_size) {
2289 phys_ram_size = start_addr + size;
2292 phys_offset &= ~IO_MEM_ROM;
2293 area_flags = phys_offset & ~TARGET_PAGE_MASK;
2295 if (area_flags != IO_MEM_RAM) {
2296 #ifdef TARGET_I386
2297 if (must_use_aliases_source(start_addr)) {
2298 kvm_destroy_memory_alias(kvm_context, start_addr);
2299 return;
2301 if (must_use_aliases_target(start_addr))
2302 return;
2303 #endif
2304 while (size > 0) {
2305 p = find_mapping(start_addr);
2306 if (p) {
2307 kvm_unregister_memory_area(kvm_context, p->phys, p->len);
2308 drop_mapping(p->phys);
2310 start_addr += TARGET_PAGE_SIZE;
2311 if (size > TARGET_PAGE_SIZE) {
2312 size -= TARGET_PAGE_SIZE;
2313 } else {
2314 size = 0;
2317 return;
2320 r = kvm_is_containing_region(kvm_context, start_addr, size);
2321 if (r)
2322 return;
2324 if (area_flags >= TLB_MMIO)
2325 return;
2327 #ifdef TARGET_I386
2328 if (must_use_aliases_source(start_addr)) {
2329 p = find_ram_mapping(phys_offset);
2330 if (p) {
2331 kvm_create_memory_alias(kvm_context, start_addr, size,
2332 p->phys + (phys_offset - p->ram));
2334 return;
2336 #endif
2338 r = kvm_register_phys_mem(kvm_context, start_addr,
2339 qemu_get_ram_ptr(phys_offset), size, 0);
2340 if (r < 0) {
2341 printf("kvm_cpu_register_physical_memory: failed\n");
2342 exit(1);
2344 #ifdef TARGET_I386
2345 drop_mapping(start_addr);
2346 p = &mappings[nr_mappings++];
2347 p->phys = start_addr;
2348 p->ram = phys_offset;
2349 p->len = size;
2350 #endif
2352 return;
2355 int kvm_setup_guest_memory(void *area, unsigned long size)
2357 int ret = 0;
2359 #ifdef MADV_DONTFORK
2360 if (kvm_enabled() && !kvm_has_sync_mmu())
2361 ret = madvise(area, size, MADV_DONTFORK);
2362 #endif
2364 if (ret)
2365 perror("madvise");
2367 return ret;
2370 #ifdef KVM_CAP_SET_GUEST_DEBUG
2372 struct kvm_set_guest_debug_data {
2373 struct kvm_guest_debug dbg;
2374 int err;
2377 static void kvm_invoke_set_guest_debug(void *data)
2379 struct kvm_set_guest_debug_data *dbg_data = data;
2381 if (cpu_single_env->kvm_cpu_state.regs_modified) {
2382 kvm_arch_put_registers(cpu_single_env);
2383 cpu_single_env->kvm_cpu_state.regs_modified = 0;
2385 dbg_data->err =
2386 kvm_set_guest_debug(cpu_single_env,
2387 &dbg_data->dbg);
2390 int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
2392 struct kvm_set_guest_debug_data data;
2394 data.dbg.control = 0;
2395 if (env->singlestep_enabled)
2396 data.dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
2398 kvm_arch_update_guest_debug(env, &data.dbg);
2399 data.dbg.control |= reinject_trap;
2401 on_vcpu(env, kvm_invoke_set_guest_debug, &data);
2402 return data.err;
2405 #endif
2407 /*
2408  * dirty pages logging
2409  */
2410 /* FIXME: use unsigned long pointer instead of unsigned char */
2411 unsigned char *kvm_dirty_bitmap = NULL;
2412 int kvm_physical_memory_set_dirty_tracking(int enable)
2414 int r = 0;
2416 if (!kvm_enabled())
2417 return 0;
2419 if (enable) {
2420 if (!kvm_dirty_bitmap) {
2421 unsigned bitmap_size = BITMAP_SIZE(phys_ram_size);
2422 kvm_dirty_bitmap = qemu_malloc(bitmap_size);
2423 r = kvm_dirty_pages_log_enable_all(kvm_context);
2425 } else {
2426 if (kvm_dirty_bitmap) {
2427 r = kvm_dirty_pages_log_reset(kvm_context);
2428 qemu_free(kvm_dirty_bitmap);
2429 kvm_dirty_bitmap = NULL;
2432 return r;
2435 /* get kvm's dirty pages bitmap and update qemu's */
2436 static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
2437 unsigned char *bitmap,
2438 unsigned long offset,
2439 unsigned long mem_size)
2441 unsigned int i, j, n = 0;
2442 unsigned char c;
2443 unsigned long page_number, addr, addr1;
2444 ram_addr_t ram_addr;
2445 unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + 7) / 8;
2447 /*
2448  * bitmap-traveling is faster than memory-traveling (for addr...)
2449  * especially when most of the memory is not dirty.
2450  */
2451 for (i = 0; i < len; i++) {
2452 c = bitmap[i];
2453 while (c > 0) {
2454 j = ffsl(c) - 1;
2455 c &= ~(1u << j);
2456 page_number = i * 8 + j;
2457 addr1 = page_number * TARGET_PAGE_SIZE;
2458 addr = offset + addr1;
2459 ram_addr = cpu_get_physical_page_desc(addr);
2460 cpu_physical_memory_set_dirty(ram_addr);
2461 n++;
2464 return 0;
2467 static int kvm_get_dirty_bitmap_cb(unsigned long start, unsigned long len,
2468 void *bitmap, void *opaque)
2470 return kvm_get_dirty_pages_log_range(start, bitmap, start, len);
2473 /*
2474  * get kvm's dirty pages bitmap and update qemu's
2475  * we only care about physical ram, which resides in slots 0 and 3
2476  */
2477 int kvm_update_dirty_pages_log(void)
2479 int r = 0;
2482 r = kvm_get_dirty_pages_range(kvm_context, 0, -1UL, NULL,
2483 kvm_get_dirty_bitmap_cb);
2484 return r;
2487 void kvm_qemu_log_memory(target_phys_addr_t start, target_phys_addr_t size,
2488 int log)
2490 if (log)
2491 kvm_dirty_pages_log_enable_slot(kvm_context, start, size);
2492 else {
2493 #ifdef TARGET_I386
2494 if (must_use_aliases_target(start))
2495 return;
2496 #endif
2497 kvm_dirty_pages_log_disable_slot(kvm_context, start, size);
2501 #ifdef KVM_CAP_IRQCHIP
2503 int kvm_set_irq(int irq, int level, int *status)
2505 return kvm_set_irq_level(kvm_context, irq, level, status);
2508 #endif
2510 int qemu_kvm_get_dirty_pages(unsigned long phys_addr, void *buf)
2512 return kvm_get_dirty_pages(kvm_context, phys_addr, buf);
2515 void kvm_mutex_unlock(void)
2517 assert(!cpu_single_env);
2518 pthread_mutex_unlock(&qemu_mutex);
2521 void kvm_mutex_lock(void)
2523 pthread_mutex_lock(&qemu_mutex);
2524 cpu_single_env = NULL;
2527 void qemu_mutex_unlock_iothread(void)
2529 if (kvm_enabled())
2530 kvm_mutex_unlock();
2533 void qemu_mutex_lock_iothread(void)
2535 if (kvm_enabled())
2536 kvm_mutex_lock();
2539 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2540 void kvm_add_ioperm_data(struct ioperm_data *data)
2542 QLIST_INSERT_HEAD(&ioperm_head, data, entries);
2545 void kvm_remove_ioperm_data(unsigned long start_port, unsigned long num)
2547 struct ioperm_data *data;
2549 data = QLIST_FIRST(&ioperm_head);
2550 while (data) {
2551 struct ioperm_data *next = QLIST_NEXT(data, entries);
2553 if (data->start_port == start_port && data->num == num) {
2554 QLIST_REMOVE(data, entries);
2555 qemu_free(data);
2558 data = next;
2562 void kvm_ioperm(CPUState *env, void *data)
2564 if (kvm_enabled() && qemu_system_ready)
2565 on_vcpu(env, kvm_arch_do_ioperm, data);
2568 #endif
2570 int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
2571 target_phys_addr_t end_addr)
2573 #ifndef TARGET_IA64
2575 #ifdef TARGET_I386
2576 if (must_use_aliases_source(start_addr))
2577 return 0;
2578 #endif
2580 kvm_get_dirty_pages_range(kvm_context, start_addr,
2581 end_addr - start_addr, NULL,
2582 kvm_get_dirty_bitmap_cb);
2583 #endif
2584 return 0;
2587 int kvm_log_start(target_phys_addr_t phys_addr, target_phys_addr_t len)
2589 #ifdef TARGET_I386
2590 if (must_use_aliases_source(phys_addr))
2591 return 0;
2592 #endif
2594 #ifndef TARGET_IA64
2595 kvm_qemu_log_memory(phys_addr, len, 1);
2596 #endif
2597 return 0;
2600 int kvm_log_stop(target_phys_addr_t phys_addr, target_phys_addr_t len)
2602 #ifdef TARGET_I386
2603 if (must_use_aliases_source(phys_addr))
2604 return 0;
2605 #endif
2607 #ifndef TARGET_IA64
2608 kvm_qemu_log_memory(phys_addr, len, 0);
2609 #endif
2610 return 0;
2613 int kvm_set_boot_cpu_id(uint32_t id)
2615 return kvm_set_boot_vcpu_id(kvm_context, id);
2618 #ifdef TARGET_I386
2619 #ifdef KVM_CAP_MCE
2620 struct kvm_x86_mce_data {
2621 CPUState *env;
2622 struct kvm_x86_mce *mce;
2623 int abort_on_error;
2626 static void kvm_do_inject_x86_mce(void *_data)
2628 struct kvm_x86_mce_data *data = _data;
2629 int r;
2631 r = kvm_set_mce(data->env, data->mce);
2632 if (r < 0) {
2633 perror("kvm_set_mce FAILED");
2634 if (data->abort_on_error)
2635 abort();
2638 #endif
2640 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
2641 uint64_t mcg_status, uint64_t addr, uint64_t misc,
2642 int abort_on_error)
2644 #ifdef KVM_CAP_MCE
2645 struct kvm_x86_mce mce = {
2646 .bank = bank,
2647 .status = status,
2648 .mcg_status = mcg_status,
2649 .addr = addr,
2650 .misc = misc,
2652 struct kvm_x86_mce_data data = {
2653 .env = cenv,
2654 .mce = &mce,
2655 .abort_on_error = abort_on_error,
2658 if (!cenv->mcg_cap) {
2659 fprintf(stderr, "MCE support is not enabled!\n");
2660 return;
2662 on_vcpu(cenv, kvm_do_inject_x86_mce, &data);
2663 #else
2664 if (abort_on_error)
2665 abort();
2666 #endif
2668 #endif