/*
 * Imported from qemu-kvm (stefanha.git), file qemu-kvm.c,
 * merge commit e890261f671a0573efbc024972d8769423fc82fc (upstream-merge).
 */
/*
 * qemu/kvm integration
 *
 * Copyright (C) 2006-2008 Qumranet Technologies
 *
 * Licensed under the terms of the GNU GPL version 2 or higher.
 */
#include "config.h"
#include "config-host.h"

#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <sys/utsname.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>

#include "hw/hw.h"
#include "sysemu.h"
#include "qemu-common.h"
#include "console.h"
#include "block.h"
#include "compatfd.h"
#include "gdbstub.h"
#include "monitor.h"

#include "qemu-kvm.h"
#include "libkvm.h"
33 #define false 0
34 #define true 1
36 #ifndef PR_MCE_KILL
37 #define PR_MCE_KILL 33
38 #endif
40 #ifndef BUS_MCEERR_AR
41 #define BUS_MCEERR_AR 4
42 #endif
43 #ifndef BUS_MCEERR_AO
44 #define BUS_MCEERR_AO 5
45 #endif
47 #define EXPECTED_KVM_API_VERSION 12
49 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
50 #error libkvm: userspace and kernel version mismatch
51 #endif
53 int kvm_irqchip = 1;
54 int kvm_pit = 1;
55 int kvm_pit_reinject = 1;
56 int kvm_nested = 0;
59 KVMState *kvm_state;
60 kvm_context_t kvm_context;
62 pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
63 pthread_cond_t qemu_vcpu_cond = PTHREAD_COND_INITIALIZER;
64 pthread_cond_t qemu_system_cond = PTHREAD_COND_INITIALIZER;
65 pthread_cond_t qemu_pause_cond = PTHREAD_COND_INITIALIZER;
66 pthread_cond_t qemu_work_cond = PTHREAD_COND_INITIALIZER;
67 __thread CPUState *current_env;
69 static int qemu_system_ready;
71 #define SIG_IPI (SIGRTMIN+4)
73 pthread_t io_thread;
74 static int io_thread_sigfd = -1;
76 static CPUState *kvm_debug_cpu_requested;
78 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
79 /* The list of ioperm_data */
80 static QLIST_HEAD(, ioperm_data) ioperm_head;
81 #endif
83 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
85 int kvm_abi = EXPECTED_KVM_API_VERSION;
86 int kvm_page_size;
#ifdef KVM_CAP_SET_GUEST_DEBUG
/* Forward a guest-debug exit to the arch handler; when it claims the
 * event, remember the vCPU that hit it and park that vCPU. */
static int kvm_debug(CPUState *env,
                     struct kvm_debug_exit_arch *arch_info)
{
    int handled = kvm_arch_debug(arch_info);

    if (handled) {
        kvm_debug_cpu_requested = env;
        env->stopped = 1;
    }
    return handled;
}
#endif
/* Report an exit reason userspace does not know how to service. */
static int handle_unhandled(uint64_t reason)
{
    fprintf(stderr, "kvm: unhandled exit %" PRIx64 "\n", reason);
    return -EINVAL;
}
#define VMX_INVALID_GUEST_STATE 0x80000021

/* Diagnose a failed VM entry.  For the VMX "invalid guest state" code,
 * print a hint about big-real-mode guests on pre-unrestricted-guest
 * Intel hardware.  Always returns -EINVAL so the caller stops the vcpu.
 * Fixes vs. original: user-visible typos "runnning" -> "running" and
 * "maybe running" -> "may be running". */
static int handle_failed_vmentry(uint64_t reason)
{
    fprintf(stderr, "kvm: vm entry failed with error 0x%" PRIx64 "\n\n", reason);

    /* Perhaps we will need to check if this machine is intel since exit
       reason 0x21 has a different interpretation on SVM */
    if (reason == VMX_INVALID_GUEST_STATE) {
        fprintf(stderr, "If you're running a guest on an Intel machine without\n");
        fprintf(stderr, "unrestricted mode support, the failure can be most likely\n");
        fprintf(stderr, "due to the guest entering an invalid state for Intel VT.\n");
        fprintf(stderr, "For example, the guest may be running in big real mode\n");
        fprintf(stderr, "which is not supported on less recent Intel processors.\n\n");
    }

    return -EINVAL;
}
127 static inline void set_gsi(kvm_context_t kvm, unsigned int gsi)
129 uint32_t *bitmap = kvm->used_gsi_bitmap;
131 if (gsi < kvm->max_gsi)
132 bitmap[gsi / 32] |= 1U << (gsi % 32);
133 else
134 DPRINTF("Invalid GSI %u\n", gsi);
137 static inline void clear_gsi(kvm_context_t kvm, unsigned int gsi)
139 uint32_t *bitmap = kvm->used_gsi_bitmap;
141 if (gsi < kvm->max_gsi)
142 bitmap[gsi / 32] &= ~(1U << (gsi % 32));
143 else
144 DPRINTF("Invalid GSI %u\n", gsi);
/* Defined later in this file; finishes VM setup once /dev/kvm is open. */
static int kvm_create_context(void);
149 int kvm_init(int smp_cpus)
151 int fd;
152 int r, gsi_count;
155 fd = open("/dev/kvm", O_RDWR);
156 if (fd == -1) {
157 perror("open /dev/kvm");
158 return -1;
160 r = ioctl(fd, KVM_GET_API_VERSION, 0);
161 if (r == -1) {
162 fprintf(stderr,
163 "kvm kernel version too old: "
164 "KVM_GET_API_VERSION ioctl not supported\n");
165 goto out_close;
167 if (r < EXPECTED_KVM_API_VERSION) {
168 fprintf(stderr, "kvm kernel version too old: "
169 "We expect API version %d or newer, but got "
170 "version %d\n", EXPECTED_KVM_API_VERSION, r);
171 goto out_close;
173 if (r > EXPECTED_KVM_API_VERSION) {
174 fprintf(stderr, "kvm userspace version too old\n");
175 goto out_close;
177 kvm_abi = r;
178 kvm_page_size = getpagesize();
179 kvm_state = qemu_mallocz(sizeof(*kvm_state));
180 kvm_context = &kvm_state->kvm_context;
182 kvm_state->fd = fd;
183 kvm_state->vmfd = -1;
184 kvm_context->opaque = cpu_single_env;
185 kvm_context->dirty_pages_log_all = 0;
186 kvm_context->no_irqchip_creation = 0;
187 kvm_context->no_pit_creation = 0;
189 #ifdef KVM_CAP_SET_GUEST_DEBUG
190 QTAILQ_INIT(&kvm_state->kvm_sw_breakpoints);
191 #endif
193 gsi_count = kvm_get_gsi_count(kvm_context);
194 if (gsi_count > 0) {
195 int gsi_bits, i;
197 /* Round up so we can search ints using ffs */
198 gsi_bits = ALIGN(gsi_count, 32);
199 kvm_context->used_gsi_bitmap = qemu_mallocz(gsi_bits / 8);
200 kvm_context->max_gsi = gsi_bits;
202 /* Mark any over-allocated bits as already in use */
203 for (i = gsi_count; i < gsi_bits; i++) {
204 set_gsi(kvm_context, i);
208 kvm_cpu_register_phys_memory_client();
210 pthread_mutex_lock(&qemu_mutex);
211 return kvm_create_context();
213 out_close:
214 close(fd);
215 return -1;
218 static void kvm_finalize(KVMState *s)
220 /* FIXME
221 if (kvm->vcpu_fd[0] != -1)
222 close(kvm->vcpu_fd[0]);
223 if (kvm->vm_fd != -1)
224 close(kvm->vm_fd);
226 close(s->fd);
227 free(s);
230 void kvm_disable_irqchip_creation(kvm_context_t kvm)
232 kvm->no_irqchip_creation = 1;
235 void kvm_disable_pit_creation(kvm_context_t kvm)
237 kvm->no_pit_creation = 1;
240 static void kvm_reset_vcpu(void *opaque)
242 CPUState *env = opaque;
244 kvm_arch_cpu_reset(env);
247 static void kvm_create_vcpu(CPUState *env, int id)
249 long mmap_size;
250 int r;
251 KVMState *s = kvm_state;
253 r = kvm_vm_ioctl(kvm_state, KVM_CREATE_VCPU, id);
254 if (r < 0) {
255 fprintf(stderr, "kvm_create_vcpu: %m\n");
256 fprintf(stderr, "Failed to create vCPU. Check the -smp parameter.\n");
257 goto err;
260 env->kvm_fd = r;
261 env->kvm_state = kvm_state;
263 mmap_size = kvm_ioctl(kvm_state, KVM_GET_VCPU_MMAP_SIZE, 0);
264 if (mmap_size < 0) {
265 fprintf(stderr, "get vcpu mmap size: %m\n");
266 goto err_fd;
268 env->kvm_run =
269 mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, env->kvm_fd,
271 if (env->kvm_run == MAP_FAILED) {
272 fprintf(stderr, "mmap vcpu area: %m\n");
273 goto err_fd;
276 #ifdef KVM_CAP_COALESCED_MMIO
277 if (s->coalesced_mmio && !s->coalesced_mmio_ring)
278 s->coalesced_mmio_ring = (void *) env->kvm_run +
279 s->coalesced_mmio * PAGE_SIZE;
280 #endif
282 r = kvm_arch_init_vcpu(env);
283 if (r == 0) {
284 qemu_register_reset(kvm_reset_vcpu, env);
287 return;
288 err_fd:
289 close(env->kvm_fd);
290 err:
291 /* We're no good with semi-broken states. */
292 abort();
295 static int kvm_set_boot_vcpu_id(kvm_context_t kvm, uint32_t id)
297 #ifdef KVM_CAP_SET_BOOT_CPU_ID
298 int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_SET_BOOT_CPU_ID);
299 if (r > 0) {
300 return kvm_vm_ioctl(kvm_state, KVM_SET_BOOT_CPU_ID, id);
302 return -ENOSYS;
303 #else
304 return -ENOSYS;
305 #endif
308 int kvm_create_vm(kvm_context_t kvm)
310 int fd;
311 #ifdef KVM_CAP_IRQ_ROUTING
312 kvm->irq_routes = qemu_mallocz(sizeof(*kvm->irq_routes));
313 kvm->nr_allocated_irq_routes = 0;
314 #endif
316 fd = kvm_ioctl(kvm_state, KVM_CREATE_VM, 0);
317 if (fd < 0) {
318 fprintf(stderr, "kvm_create_vm: %m\n");
319 return -1;
321 kvm_state->vmfd = fd;
322 return 0;
325 static int kvm_create_default_phys_mem(kvm_context_t kvm,
326 unsigned long phys_mem_bytes,
327 void **vm_mem)
329 #ifdef KVM_CAP_USER_MEMORY
330 int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY);
331 if (r > 0)
332 return 0;
333 fprintf(stderr,
334 "Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported\n");
335 #else
336 #error Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported
337 #endif
338 return -1;
341 void kvm_create_irqchip(kvm_context_t kvm)
343 int r;
345 kvm->irqchip_in_kernel = 0;
346 #ifdef KVM_CAP_IRQCHIP
347 if (!kvm->no_irqchip_creation) {
348 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP);
349 if (r > 0) { /* kernel irqchip supported */
350 r = kvm_vm_ioctl(kvm_state, KVM_CREATE_IRQCHIP);
351 if (r >= 0) {
352 kvm->irqchip_inject_ioctl = KVM_IRQ_LINE;
353 #if defined(KVM_CAP_IRQ_INJECT_STATUS) && defined(KVM_IRQ_LINE_STATUS)
354 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION,
355 KVM_CAP_IRQ_INJECT_STATUS);
356 if (r > 0) {
357 kvm->irqchip_inject_ioctl = KVM_IRQ_LINE_STATUS;
359 #endif
360 kvm->irqchip_in_kernel = 1;
361 } else
362 fprintf(stderr, "Create kernel PIC irqchip failed\n");
365 #endif
366 kvm_state->irqchip_in_kernel = kvm->irqchip_in_kernel;
369 int kvm_create(kvm_context_t kvm, unsigned long phys_mem_bytes, void **vm_mem)
371 int r, i;
373 r = kvm_create_vm(kvm);
374 if (r < 0) {
375 return r;
377 r = kvm_arch_create(kvm, phys_mem_bytes, vm_mem);
378 if (r < 0) {
379 return r;
381 for (i = 0; i < ARRAY_SIZE(kvm_state->slots); i++) {
382 kvm_state->slots[i].slot = i;
385 r = kvm_create_default_phys_mem(kvm, phys_mem_bytes, vm_mem);
386 if (r < 0) {
387 return r;
390 kvm_create_irqchip(kvm);
392 return 0;
395 #ifdef KVM_CAP_IRQCHIP
397 int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int *status)
399 struct kvm_irq_level event;
400 int r;
402 if (!kvm->irqchip_in_kernel) {
403 return 0;
405 event.level = level;
406 event.irq = irq;
407 r = kvm_vm_ioctl(kvm_state, kvm->irqchip_inject_ioctl, &event);
408 if (r < 0) {
409 perror("kvm_set_irq_level");
412 if (status) {
413 #ifdef KVM_CAP_IRQ_INJECT_STATUS
414 *status =
415 (kvm->irqchip_inject_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
416 #else
417 *status = 1;
418 #endif
421 return 1;
424 int kvm_get_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
426 int r;
428 if (!kvm->irqchip_in_kernel) {
429 return 0;
431 r = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip);
432 if (r < 0) {
433 perror("kvm_get_irqchip\n");
435 return r;
438 int kvm_set_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
440 int r;
442 if (!kvm->irqchip_in_kernel) {
443 return 0;
445 r = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip);
446 if (r < 0) {
447 perror("kvm_set_irqchip\n");
449 return r;
452 #endif
454 static int handle_debug(CPUState *env)
456 #ifdef KVM_CAP_SET_GUEST_DEBUG
457 struct kvm_run *run = env->kvm_run;
459 return kvm_debug(env, &run->debug.arch);
460 #else
461 return 0;
462 #endif
465 int kvm_get_regs(CPUState *env, struct kvm_regs *regs)
467 return kvm_vcpu_ioctl(env, KVM_GET_REGS, regs);
470 int kvm_set_regs(CPUState *env, struct kvm_regs *regs)
472 return kvm_vcpu_ioctl(env, KVM_SET_REGS, regs);
475 int kvm_get_fpu(CPUState *env, struct kvm_fpu *fpu)
477 return kvm_vcpu_ioctl(env, KVM_GET_FPU, fpu);
480 int kvm_set_fpu(CPUState *env, struct kvm_fpu *fpu)
482 return kvm_vcpu_ioctl(env, KVM_SET_FPU, fpu);
485 int kvm_get_sregs(CPUState *env, struct kvm_sregs *sregs)
487 return kvm_vcpu_ioctl(env, KVM_GET_SREGS, sregs);
490 int kvm_set_sregs(CPUState *env, struct kvm_sregs *sregs)
492 return kvm_vcpu_ioctl(env, KVM_SET_SREGS, sregs);
495 #ifdef KVM_CAP_MP_STATE
496 int kvm_get_mpstate(CPUState *env, struct kvm_mp_state *mp_state)
498 int r;
500 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE);
501 if (r > 0) {
502 return kvm_vcpu_ioctl(env, KVM_GET_MP_STATE, mp_state);
504 return -ENOSYS;
507 int kvm_set_mpstate(CPUState *env, struct kvm_mp_state *mp_state)
509 int r;
511 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE);
512 if (r > 0) {
513 return kvm_vcpu_ioctl(env, KVM_SET_MP_STATE, mp_state);
515 return -ENOSYS;
517 #endif
519 #ifdef KVM_CAP_XSAVE
520 int kvm_get_xsave(CPUState *env, struct kvm_xsave *xsave)
522 return kvm_vcpu_ioctl(env, KVM_GET_XSAVE, xsave);
525 int kvm_set_xsave(CPUState *env, struct kvm_xsave *xsave)
527 return kvm_vcpu_ioctl(env, KVM_SET_XSAVE, xsave);
529 #endif
531 #ifdef KVM_CAP_XCRS
532 int kvm_get_xcrs(CPUState *env, struct kvm_xcrs *xcrs)
534 return kvm_vcpu_ioctl(env, KVM_GET_XCRS, xcrs);
537 int kvm_set_xcrs(CPUState *env, struct kvm_xcrs *xcrs)
539 return kvm_vcpu_ioctl(env, KVM_SET_XCRS, xcrs);
541 #endif
543 static int handle_mmio(CPUState *env)
545 unsigned long addr = env->kvm_run->mmio.phys_addr;
546 struct kvm_run *kvm_run = env->kvm_run;
547 void *data = kvm_run->mmio.data;
549 /* hack: Red Hat 7.1 generates these weird accesses. */
550 if ((addr > 0xa0000 - 4 && addr <= 0xa0000) && kvm_run->mmio.len == 3) {
551 return 0;
554 cpu_physical_memory_rw(addr, data, kvm_run->mmio.len, kvm_run->mmio.is_write);
555 return 0;
558 int handle_io_window(kvm_context_t kvm)
560 return 1;
563 int handle_shutdown(kvm_context_t kvm, CPUState *env)
565 /* stop the current vcpu from going back to guest mode */
566 env->stopped = 1;
568 qemu_system_reset_request();
569 return 1;
572 static inline void push_nmi(kvm_context_t kvm)
574 #ifdef KVM_CAP_USER_NMI
575 kvm_arch_push_nmi(kvm->opaque);
576 #endif /* KVM_CAP_USER_NMI */
579 void post_kvm_run(kvm_context_t kvm, CPUState *env)
581 pthread_mutex_lock(&qemu_mutex);
582 kvm_arch_post_run(env, env->kvm_run);
583 cpu_single_env = env;
586 int pre_kvm_run(kvm_context_t kvm, CPUState *env)
588 kvm_arch_pre_run(env, env->kvm_run);
590 pthread_mutex_unlock(&qemu_mutex);
591 return 0;
594 int kvm_is_ready_for_interrupt_injection(CPUState *env)
596 return env->kvm_run->ready_for_interrupt_injection;
599 int kvm_run(CPUState *env)
601 int r;
602 kvm_context_t kvm = &env->kvm_state->kvm_context;
603 struct kvm_run *run = env->kvm_run;
604 int fd = env->kvm_fd;
606 again:
607 if (env->kvm_vcpu_dirty) {
608 kvm_arch_load_regs(env, KVM_PUT_RUNTIME_STATE);
609 env->kvm_vcpu_dirty = 0;
611 push_nmi(kvm);
612 #if !defined(__s390__)
613 if (!kvm->irqchip_in_kernel) {
614 run->request_interrupt_window = kvm_arch_try_push_interrupts(env);
616 #endif
618 r = pre_kvm_run(kvm, env);
619 if (r) {
620 return r;
622 if (env->exit_request) {
623 env->exit_request = 0;
624 pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
626 r = ioctl(fd, KVM_RUN, 0);
628 if (r == -1 && errno != EINTR && errno != EAGAIN) {
629 r = -errno;
630 post_kvm_run(kvm, env);
631 fprintf(stderr, "kvm_run: %s\n", strerror(-r));
632 return r;
635 post_kvm_run(kvm, env);
637 kvm_flush_coalesced_mmio_buffer();
639 #if !defined(__s390__)
640 if (r == -1) {
641 r = handle_io_window(kvm);
642 goto more;
644 #endif
645 if (1) {
646 switch (run->exit_reason) {
647 case KVM_EXIT_UNKNOWN:
648 r = handle_unhandled(run->hw.hardware_exit_reason);
649 break;
650 case KVM_EXIT_FAIL_ENTRY:
651 r = handle_failed_vmentry(run->fail_entry.hardware_entry_failure_reason);
652 break;
653 case KVM_EXIT_EXCEPTION:
654 fprintf(stderr, "exception %d (%x)\n", run->ex.exception,
655 run->ex.error_code);
656 kvm_show_regs(env);
657 kvm_show_code(env);
658 abort();
659 break;
660 case KVM_EXIT_IO:
661 r = kvm_handle_io(run->io.port,
662 (uint8_t *)run + run->io.data_offset,
663 run->io.direction,
664 run->io.size,
665 run->io.count);
666 r = 0;
667 break;
668 case KVM_EXIT_DEBUG:
669 r = handle_debug(env);
670 break;
671 case KVM_EXIT_MMIO:
672 r = handle_mmio(env);
673 break;
674 case KVM_EXIT_HLT:
675 r = kvm_arch_halt(env);
676 break;
677 case KVM_EXIT_IRQ_WINDOW_OPEN:
678 break;
679 case KVM_EXIT_SHUTDOWN:
680 r = handle_shutdown(kvm, env);
681 break;
682 #if defined(__s390__)
683 case KVM_EXIT_S390_SIEIC:
684 r = kvm_s390_handle_intercept(kvm, env, run);
685 break;
686 case KVM_EXIT_S390_RESET:
687 r = kvm_s390_handle_reset(kvm, env, run);
688 break;
689 #endif
690 case KVM_EXIT_INTERNAL_ERROR:
691 kvm_handle_internal_error(env, run);
692 r = 1;
693 break;
694 default:
695 if (kvm_arch_run(env)) {
696 fprintf(stderr, "unhandled vm exit: 0x%x\n", run->exit_reason);
697 kvm_show_regs(env);
698 abort();
700 break;
703 more:
704 if (!r) {
705 goto again;
707 return r;
710 int kvm_inject_irq(CPUState *env, unsigned irq)
712 struct kvm_interrupt intr;
714 intr.irq = irq;
715 return kvm_vcpu_ioctl(env, KVM_INTERRUPT, &intr);
718 int kvm_inject_nmi(CPUState *env)
720 #ifdef KVM_CAP_USER_NMI
721 return kvm_vcpu_ioctl(env, KVM_NMI);
722 #else
723 return -ENOSYS;
724 #endif
727 int kvm_init_coalesced_mmio(kvm_context_t kvm)
729 int r = 0;
730 kvm_state->coalesced_mmio = 0;
731 #ifdef KVM_CAP_COALESCED_MMIO
732 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
733 if (r > 0) {
734 kvm_state->coalesced_mmio = r;
735 return 0;
737 #endif
738 return r;
741 #ifdef KVM_CAP_DEVICE_ASSIGNMENT
742 int kvm_assign_pci_device(kvm_context_t kvm,
743 struct kvm_assigned_pci_dev *assigned_dev)
745 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_PCI_DEVICE, assigned_dev);
748 static int kvm_old_assign_irq(kvm_context_t kvm,
749 struct kvm_assigned_irq *assigned_irq)
751 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_IRQ, assigned_irq);
754 #ifdef KVM_CAP_ASSIGN_DEV_IRQ
755 int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
757 int ret;
759 ret = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_ASSIGN_DEV_IRQ);
760 if (ret > 0) {
761 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_DEV_IRQ, assigned_irq);
764 return kvm_old_assign_irq(kvm, assigned_irq);
767 int kvm_deassign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
769 return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_DEV_IRQ, assigned_irq);
771 #else
772 int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
774 return kvm_old_assign_irq(kvm, assigned_irq);
776 #endif
777 #endif
#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
/* Return a previously assigned PCI device to the host. */
int kvm_deassign_pci_device(kvm_context_t kvm,
                            struct kvm_assigned_pci_dev *assigned_dev)
{
    return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_PCI_DEVICE, assigned_dev);
}
#endif
787 int kvm_reinject_control(kvm_context_t kvm, int pit_reinject)
789 #ifdef KVM_CAP_REINJECT_CONTROL
790 int r;
791 struct kvm_reinject_control control;
793 control.pit_reinject = pit_reinject;
795 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_REINJECT_CONTROL);
796 if (r > 0) {
797 return kvm_vm_ioctl(kvm_state, KVM_REINJECT_CONTROL, &control);
799 #endif
800 return -ENOSYS;
/* Non-zero when the kernel supports GSI routing tables. */
int kvm_has_gsi_routing(void)
{
    int r = 0;

#ifdef KVM_CAP_IRQ_ROUTING
    r = kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#endif
    return r;
}
813 int kvm_get_gsi_count(kvm_context_t kvm)
815 #ifdef KVM_CAP_IRQ_ROUTING
816 return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
817 #else
818 return -EINVAL;
819 #endif
/* Drop all cached routing entries (does not touch the kernel table
 * until kvm_commit_irq_routes()). */
int kvm_clear_gsi_routes(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;

    kvm->irq_routes->nr = 0;
    return 0;
#else
    return -EINVAL;
#endif
}
/* Append a routing entry to the cached table, growing the table
 * geometrically (min 64 slots), and mark its GSI as used. */
int kvm_add_routing_entry(struct kvm_irq_routing_entry *entry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;
    struct kvm_irq_routing *grown;
    struct kvm_irq_routing_entry *dst;
    int slot, want, bytes;

    if (kvm->irq_routes->nr == kvm->nr_allocated_irq_routes) {
        want = kvm->nr_allocated_irq_routes * 2;
        if (want < 64) {
            want = 64;
        }
        bytes = sizeof(struct kvm_irq_routing);
        bytes += want * sizeof(*dst);
        grown = realloc(kvm->irq_routes, bytes);
        if (!grown) {
            return -ENOMEM;
        }
        kvm->nr_allocated_irq_routes = want;
        kvm->irq_routes = grown;
    }
    slot = kvm->irq_routes->nr++;
    dst = &kvm->irq_routes->entries[slot];
    memset(dst, 0, sizeof(*dst));
    dst->gsi = entry->gsi;
    dst->type = entry->type;
    dst->flags = entry->flags;
    dst->u = entry->u;

    set_gsi(kvm, entry->gsi);

    return 0;
#else
    return -ENOSYS;
#endif
}
/* Add an irqchip pin -> GSI route.
 * Fix vs. original: the stack entry is now zero-initialized, so the
 * unused bytes of the union (copied wholesale by
 * kvm_add_routing_entry) no longer carry indeterminate stack data. */
int kvm_add_irq_route(int gsi, int irqchip, int pin)
{
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing_entry e = { 0 };

    e.gsi = gsi;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    return kvm_add_routing_entry(&e);
#else
    return -ENOSYS;
#endif
}
/* Remove the first cached entry matching *entry (swap-with-last delete).
 * When the removed route was the GSI's last user, the GSI is returned
 * to the free bitmap.  -ESRCH when no entry matches. */
int kvm_del_routing_entry(struct kvm_irq_routing_entry *entry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;
    struct kvm_irq_routing_entry *e, *last;
    int i, gsi, found = 0;

    gsi = entry->gsi;

    for (i = 0; i < kvm->irq_routes->nr; ++i) {
        e = &kvm->irq_routes->entries[i];
        if (e->type != entry->type || e->gsi != gsi) {
            continue;
        }
        switch (e->type) {
        case KVM_IRQ_ROUTING_IRQCHIP:
            if (e->u.irqchip.irqchip == entry->u.irqchip.irqchip
                && e->u.irqchip.pin == entry->u.irqchip.pin) {
                last = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
                *e = *last;
                found = 1;
            }
            break;
        case KVM_IRQ_ROUTING_MSI:
            if (e->u.msi.address_lo == entry->u.msi.address_lo
                && e->u.msi.address_hi == entry->u.msi.address_hi
                && e->u.msi.data == entry->u.msi.data) {
                last = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
                *e = *last;
                found = 1;
            }
            break;
        default:
            break;
        }
        if (found) {
            /* If there are no other users of this GSI
             * mark it available in the bitmap */
            for (i = 0; i < kvm->irq_routes->nr; i++) {
                e = &kvm->irq_routes->entries[i];
                if (e->gsi == gsi)
                    break;
            }
            if (i == kvm->irq_routes->nr) {
                clear_gsi(kvm, gsi);
            }

            return 0;
        }
    }
    return -ESRCH;
#else
    return -ENOSYS;
#endif
}
/* Replace the payload of the cached route matching *entry with
 * *newentry.  gsi and type must be unchanged.  -ESRCH when no cached
 * route matches. */
int kvm_update_routing_entry(struct kvm_irq_routing_entry *entry,
                             struct kvm_irq_routing_entry *newentry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;
    struct kvm_irq_routing_entry *e;
    int i;

    if (entry->gsi != newentry->gsi || entry->type != newentry->type) {
        return -EINVAL;
    }

    for (i = 0; i < kvm->irq_routes->nr; ++i) {
        e = &kvm->irq_routes->entries[i];
        if (e->type != entry->type || e->gsi != entry->gsi) {
            continue;
        }
        switch (e->type) {
        case KVM_IRQ_ROUTING_IRQCHIP:
            if (e->u.irqchip.irqchip == entry->u.irqchip.irqchip &&
                e->u.irqchip.pin == entry->u.irqchip.pin) {
                memcpy(&e->u.irqchip, &newentry->u.irqchip,
                       sizeof e->u.irqchip);
                return 0;
            }
            break;
        case KVM_IRQ_ROUTING_MSI:
            if (e->u.msi.address_lo == entry->u.msi.address_lo &&
                e->u.msi.address_hi == entry->u.msi.address_hi &&
                e->u.msi.data == entry->u.msi.data) {
                memcpy(&e->u.msi, &newentry->u.msi, sizeof e->u.msi);
                return 0;
            }
            break;
        default:
            break;
        }
    }
    return -ESRCH;
#else
    return -ENOSYS;
#endif
}
/* Delete an irqchip pin -> GSI route.
 * Fix vs. original: zero-initialize the stack entry so no indeterminate
 * union bytes are involved in the match/delete path. */
int kvm_del_irq_route(int gsi, int irqchip, int pin)
{
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing_entry e = { 0 };

    e.gsi = gsi;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    return kvm_del_routing_entry(&e);
#else
    return -ENOSYS;
#endif
}
/* Push the cached routing table into the kernel. */
int kvm_commit_irq_routes(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;

    kvm->irq_routes->flags = 0;
    return kvm_vm_ioctl(kvm_state, KVM_SET_GSI_ROUTING, kvm->irq_routes);
#else
    return -ENOSYS;
#endif
}
1020 int kvm_get_irq_route_gsi(void)
1022 kvm_context_t kvm = kvm_context;
1023 int i, bit;
1024 uint32_t *buf = kvm->used_gsi_bitmap;
1026 /* Return the lowest unused GSI in the bitmap */
1027 for (i = 0; i < kvm->max_gsi / 32; i++) {
1028 bit = ffs(~buf[i]);
1029 if (!bit) {
1030 continue;
1033 return bit - 1 + i * 32;
1036 return -ENOSPC;
1039 static void kvm_msix_routing_entry(struct kvm_irq_routing_entry *e,
1040 uint32_t gsi, uint32_t addr_lo,
1041 uint32_t addr_hi, uint32_t data)
1044 e->gsi = gsi;
1045 e->type = KVM_IRQ_ROUTING_MSI;
1046 e->flags = 0;
1047 e->u.msi.address_lo = addr_lo;
1048 e->u.msi.address_hi = addr_hi;
1049 e->u.msi.data = data;
1052 int kvm_add_msix(uint32_t gsi, uint32_t addr_lo,
1053 uint32_t addr_hi, uint32_t data)
1055 struct kvm_irq_routing_entry e;
1057 kvm_msix_routing_entry(&e, gsi, addr_lo, addr_hi, data);
1058 return kvm_add_routing_entry(&e);
1061 int kvm_del_msix(uint32_t gsi, uint32_t addr_lo,
1062 uint32_t addr_hi, uint32_t data)
1064 struct kvm_irq_routing_entry e;
1066 kvm_msix_routing_entry(&e, gsi, addr_lo, addr_hi, data);
1067 return kvm_del_routing_entry(&e);
1070 int kvm_update_msix(uint32_t old_gsi, uint32_t old_addr_lo,
1071 uint32_t old_addr_hi, uint32_t old_data,
1072 uint32_t new_gsi, uint32_t new_addr_lo,
1073 uint32_t new_addr_hi, uint32_t new_data)
1075 struct kvm_irq_routing_entry e1, e2;
1077 kvm_msix_routing_entry(&e1, old_gsi, old_addr_lo, old_addr_hi, old_data);
1078 kvm_msix_routing_entry(&e2, new_gsi, new_addr_lo, new_addr_hi, new_data);
1079 return kvm_update_routing_entry(&e1, &e2);
1083 #ifdef KVM_CAP_DEVICE_MSIX
1084 int kvm_assign_set_msix_nr(kvm_context_t kvm,
1085 struct kvm_assigned_msix_nr *msix_nr)
1087 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_NR, msix_nr);
1090 int kvm_assign_set_msix_entry(kvm_context_t kvm,
1091 struct kvm_assigned_msix_entry *entry)
1093 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_ENTRY, entry);
1095 #endif
1097 #if defined(KVM_CAP_IRQFD) && defined(CONFIG_EVENTFD)
1099 #include <sys/eventfd.h>
1101 static int _kvm_irqfd(kvm_context_t kvm, int fd, int gsi, int flags)
1103 struct kvm_irqfd data = {
1104 .fd = fd,
1105 .gsi = gsi,
1106 .flags = flags,
1109 return kvm_vm_ioctl(kvm_state, KVM_IRQFD, &data);
1112 int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
1114 int r;
1115 int fd;
1117 if (!kvm_check_extension(kvm_state, KVM_CAP_IRQFD))
1118 return -ENOENT;
1120 fd = eventfd(0, 0);
1121 if (fd < 0) {
1122 return -errno;
1125 r = _kvm_irqfd(kvm, fd, gsi, 0);
1126 if (r < 0) {
1127 close(fd);
1128 return -errno;
1131 return fd;
1134 #else /* KVM_CAP_IRQFD */
1136 int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
1138 return -ENOSYS;
1141 #endif /* KVM_CAP_IRQFD */
/* Kernel TID of the calling thread (gettid has no glibc wrapper here). */
unsigned long kvm_get_thread_id(void)
{
    return syscall(SYS_gettid);
}
1147 static void qemu_cond_wait(pthread_cond_t *cond)
1149 CPUState *env = cpu_single_env;
1151 pthread_cond_wait(cond, &qemu_mutex);
1152 cpu_single_env = env;
/* SIG_IPI exists only to interrupt a blocking KVM_RUN; nothing to do. */
static void sig_ipi_handler(int n)
{
}
/* Unrecoverable host memory fault affecting QEMU itself: bail out. */
static void hardware_memory_error(void)
{
    fprintf(stderr, "Hardware memory error!\n");
    exit(1);
}
1165 static void sigbus_reraise(void)
1167 sigset_t set;
1168 struct sigaction action;
1170 memset(&action, 0, sizeof(action));
1171 action.sa_handler = SIG_DFL;
1172 if (!sigaction(SIGBUS, &action, NULL)) {
1173 raise(SIGBUS);
1174 sigemptyset(&set);
1175 sigaddset(&set, SIGBUS);
1176 sigprocmask(SIG_UNBLOCK, &set, NULL);
1178 perror("Failed to re-raise SIGBUS!\n");
1179 abort();
1182 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
1183 void *ctx)
1185 #if defined(KVM_CAP_MCE) && defined(TARGET_I386)
1186 if ((first_cpu->mcg_cap & MCG_SER_P) && siginfo->ssi_addr
1187 && siginfo->ssi_code == BUS_MCEERR_AO) {
1188 uint64_t status;
1189 void *vaddr;
1190 ram_addr_t ram_addr;
1191 unsigned long paddr;
1192 //CPUState *cenv;
1194 /* Hope we are lucky for AO MCE */
1195 vaddr = (void *)(intptr_t)siginfo->ssi_addr;
1196 if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
1197 !kvm_physical_memory_addr_from_ram(kvm_state, ram_addr, (target_phys_addr_t *)&paddr)) {
1198 fprintf(stderr, "Hardware memory error for memory used by "
1199 "QEMU itself instead of guest system!: %llx\n",
1200 (unsigned long long)siginfo->ssi_addr);
1201 return;
1203 status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1204 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1205 | 0xc0;
1206 #if 0
1207 kvm_inject_x86_mce(first_cpu, 9, status,
1208 MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
1209 (MCM_ADDR_PHYS << 6) | 0xc, 1);
1210 for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu) {
1211 kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
1212 MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
1214 #endif
1215 } else
1216 #endif
1218 if (siginfo->ssi_code == BUS_MCEERR_AO) {
1219 return;
1220 } else if (siginfo->ssi_code == BUS_MCEERR_AR) {
1221 hardware_memory_error();
1222 } else {
1223 sigbus_reraise();
1228 static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
1230 struct qemu_work_item wi;
1232 if (env == current_env) {
1233 func(data);
1234 return;
1237 wi.func = func;
1238 wi.data = data;
1239 if (!env->kvm_cpu_state.queued_work_first) {
1240 env->kvm_cpu_state.queued_work_first = &wi;
1241 } else {
1242 env->kvm_cpu_state.queued_work_last->next = &wi;
1244 env->kvm_cpu_state.queued_work_last = &wi;
1245 wi.next = NULL;
1246 wi.done = false;
1248 pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
1249 while (!wi.done) {
1250 qemu_cond_wait(&qemu_work_cond);
1254 static void do_kvm_cpu_synchronize_state(void *_env)
1256 CPUState *env = _env;
1258 if (!env->kvm_vcpu_dirty) {
1259 kvm_arch_save_regs(env);
1260 env->kvm_vcpu_dirty = 1;
1264 void kvm_cpu_synchronize_state(CPUState *env)
1266 if (!env->kvm_vcpu_dirty) {
1267 on_vcpu(env, do_kvm_cpu_synchronize_state, env);
1271 void kvm_cpu_synchronize_post_reset(CPUState *env)
1273 kvm_arch_load_regs(env, KVM_PUT_RESET_STATE);
1274 env->kvm_vcpu_dirty = 0;
1277 void kvm_cpu_synchronize_post_init(CPUState *env)
1279 kvm_arch_load_regs(env, KVM_PUT_FULL_STATE);
1280 env->kvm_vcpu_dirty = 0;
1283 static void inject_interrupt(void *data)
1285 cpu_interrupt(current_env, (long) data);
1288 void kvm_inject_interrupt(CPUState *env, int mask)
1290 on_vcpu(env, inject_interrupt, (void *) (long) mask);
1293 void kvm_update_interrupt_request(CPUState *env)
1295 int signal = 0;
1297 if (env) {
1298 if (!current_env || !current_env->created) {
1299 signal = 1;
1302 * Testing for created here is really redundant
1304 if (current_env && current_env->created &&
1305 env != current_env && !env->kvm_cpu_state.signalled) {
1306 signal = 1;
1309 if (signal) {
1310 env->kvm_cpu_state.signalled = 1;
1311 if (env->kvm_cpu_state.thread) {
1312 pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
1318 int kvm_cpu_exec(CPUState *env)
1320 int r;
1322 r = kvm_run(env);
1323 if (r < 0) {
1324 printf("kvm_run returned %d\n", r);
1325 vm_stop(0);
1328 return 0;
1331 int kvm_cpu_is_stopped(CPUState *env)
1333 return !vm_running || env->stopped;
1336 static void flush_queued_work(CPUState *env)
1338 struct qemu_work_item *wi;
1340 if (!env->kvm_cpu_state.queued_work_first) {
1341 return;
1344 while ((wi = env->kvm_cpu_state.queued_work_first)) {
1345 env->kvm_cpu_state.queued_work_first = wi->next;
1346 wi->func(wi->data);
1347 wi->done = true;
1349 env->kvm_cpu_state.queued_work_last = NULL;
1350 pthread_cond_broadcast(&qemu_work_cond);
1353 static int kvm_mce_in_exception(CPUState *env)
1355 struct kvm_msr_entry msr_mcg_status = {
1356 .index = MSR_MCG_STATUS,
1358 int r;
1360 r = kvm_get_msrs(env, &msr_mcg_status, 1);
1361 if (r == -1 || r == 0) {
1362 return -1;
1364 return !!(msr_mcg_status.data & MCG_STATUS_MCIP);
1367 static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
1369 #if defined(KVM_CAP_MCE) && defined(TARGET_I386)
1370 struct kvm_x86_mce mce = {
1371 .bank = 9,
1373 void *vaddr;
1374 ram_addr_t ram_addr;
1375 unsigned long paddr;
1376 int r;
1378 if ((env->mcg_cap & MCG_SER_P) && siginfo->si_addr
1379 && (siginfo->si_code == BUS_MCEERR_AR
1380 || siginfo->si_code == BUS_MCEERR_AO)) {
1381 if (siginfo->si_code == BUS_MCEERR_AR) {
1382 /* Fake an Intel architectural Data Load SRAR UCR */
1383 mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1384 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1385 | MCI_STATUS_AR | 0x134;
1386 mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
1387 mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
1388 } else {
1390 * If there is an MCE excpetion being processed, ignore
1391 * this SRAO MCE
1393 r = kvm_mce_in_exception(env);
1394 if (r == -1) {
1395 fprintf(stderr, "Failed to get MCE status\n");
1396 } else if (r) {
1397 return;
1399 /* Fake an Intel architectural Memory scrubbing UCR */
1400 mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1401 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1402 | 0xc0;
1403 mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
1404 mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
1406 vaddr = (void *)siginfo->si_addr;
1407 if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
1408 !kvm_physical_memory_addr_from_ram(kvm_state, ram_addr, (target_phys_addr_t *)&paddr)) {
1409 fprintf(stderr, "Hardware memory error for memory used by "
1410 "QEMU itself instead of guest system!\n");
1411 /* Hope we are lucky for AO MCE */
1412 if (siginfo->si_code == BUS_MCEERR_AO) {
1413 return;
1414 } else {
1415 hardware_memory_error();
1418 mce.addr = paddr;
1419 // r = kvm_set_mce(env, &mce);
1420 r = 0;
1421 if (r < 0) {
1422 fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
1423 abort();
1425 } else
1426 #endif
1428 if (siginfo->si_code == BUS_MCEERR_AO) {
1429 return;
1430 } else if (siginfo->si_code == BUS_MCEERR_AR) {
1431 hardware_memory_error();
1432 } else {
1433 sigbus_reraise();
1438 static void kvm_main_loop_wait(CPUState *env, int timeout)
1440 struct timespec ts;
1441 int r, e;
1442 siginfo_t siginfo;
1443 sigset_t waitset;
1444 sigset_t chkset;
1446 ts.tv_sec = timeout / 1000;
1447 ts.tv_nsec = (timeout % 1000) * 1000000;
1448 sigemptyset(&waitset);
1449 sigaddset(&waitset, SIG_IPI);
1450 sigaddset(&waitset, SIGBUS);
1452 do {
1453 pthread_mutex_unlock(&qemu_mutex);
1455 r = sigtimedwait(&waitset, &siginfo, &ts);
1456 e = errno;
1458 pthread_mutex_lock(&qemu_mutex);
1460 if (r == -1 && !(e == EAGAIN || e == EINTR)) {
1461 printf("sigtimedwait: %s\n", strerror(e));
1462 exit(1);
1465 switch (r) {
1466 case SIGBUS:
1467 kvm_on_sigbus(env, &siginfo);
1468 break;
1469 default:
1470 break;
1473 r = sigpending(&chkset);
1474 if (r == -1) {
1475 printf("sigpending: %s\n", strerror(e));
1476 exit(1);
1478 } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
1480 cpu_single_env = env;
1481 flush_queued_work(env);
1483 if (env->stop) {
1484 env->stop = 0;
1485 env->stopped = 1;
1486 pthread_cond_signal(&qemu_pause_cond);
1489 env->kvm_cpu_state.signalled = 0;
1492 static int all_threads_paused(void)
1494 CPUState *penv = first_cpu;
1496 while (penv) {
1497 if (penv->stop) {
1498 return 0;
1500 penv = (CPUState *) penv->next_cpu;
1503 return 1;
1506 static void pause_all_threads(void)
1508 CPUState *penv = first_cpu;
1510 while (penv) {
1511 if (penv != cpu_single_env) {
1512 penv->stop = 1;
1513 pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI);
1514 } else {
1515 penv->stop = 0;
1516 penv->stopped = 1;
1517 cpu_exit(penv);
1519 penv = (CPUState *) penv->next_cpu;
1522 while (!all_threads_paused()) {
1523 qemu_cond_wait(&qemu_pause_cond);
1527 static void resume_all_threads(void)
1529 CPUState *penv = first_cpu;
1531 assert(!cpu_single_env);
1533 while (penv) {
1534 penv->stop = 0;
1535 penv->stopped = 0;
1536 pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI);
1537 penv = (CPUState *) penv->next_cpu;
/* VM state-change callback: pause every VCPU when the machine stops,
 * resume them all when it starts running again. */
static void kvm_vm_state_change_handler(void *context, int running, int reason)
{
    if (!running) {
        pause_all_threads();
    } else {
        resume_all_threads();
    }
}
1550 static void setup_kernel_sigmask(CPUState *env)
1552 sigset_t set;
1554 sigemptyset(&set);
1555 sigaddset(&set, SIGUSR2);
1556 sigaddset(&set, SIGIO);
1557 sigaddset(&set, SIGALRM);
1558 sigprocmask(SIG_BLOCK, &set, NULL);
1560 sigprocmask(SIG_BLOCK, NULL, &set);
1561 sigdelset(&set, SIG_IPI);
1562 sigdelset(&set, SIGBUS);
1564 kvm_set_signal_mask(env, &set);
/* Perform a full system reset with every VCPU quiesced around it. */
static void qemu_kvm_system_reset(void)
{
    pause_all_threads();

    qemu_system_reset();

    resume_all_threads();
}
1576 static void process_irqchip_events(CPUState *env)
1578 kvm_arch_process_irqchip_events(env);
1579 if (kvm_arch_has_work(env))
1580 env->halted = 0;
1583 static int kvm_main_loop_cpu(CPUState *env)
1585 while (1) {
1586 int run_cpu = !kvm_cpu_is_stopped(env);
1587 if (run_cpu && !kvm_irqchip_in_kernel()) {
1588 process_irqchip_events(env);
1589 run_cpu = !env->halted;
1591 if (run_cpu) {
1592 kvm_cpu_exec(env);
1593 kvm_main_loop_wait(env, 0);
1594 } else {
1595 kvm_main_loop_wait(env, 1000);
1598 pthread_mutex_unlock(&qemu_mutex);
1599 return 0;
1602 static void *ap_main_loop(void *_env)
1604 CPUState *env = _env;
1605 sigset_t signals;
1606 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1607 struct ioperm_data *data = NULL;
1608 #endif
1610 current_env = env;
1611 env->thread_id = kvm_get_thread_id();
1612 sigfillset(&signals);
1613 sigprocmask(SIG_BLOCK, &signals, NULL);
1615 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1616 /* do ioperm for io ports of assigned devices */
1617 QLIST_FOREACH(data, &ioperm_head, entries)
1618 on_vcpu(env, kvm_arch_do_ioperm, data);
1619 #endif
1621 pthread_mutex_lock(&qemu_mutex);
1622 cpu_single_env = env;
1624 kvm_create_vcpu(env, env->cpu_index);
1625 setup_kernel_sigmask(env);
1627 /* signal VCPU creation */
1628 current_env->created = 1;
1629 pthread_cond_signal(&qemu_vcpu_cond);
1631 /* and wait for machine initialization */
1632 while (!qemu_system_ready) {
1633 qemu_cond_wait(&qemu_system_cond);
1636 /* re-initialize cpu_single_env after re-acquiring qemu_mutex */
1637 cpu_single_env = env;
1639 kvm_main_loop_cpu(env);
1640 return NULL;
1643 int kvm_init_vcpu(CPUState *env)
1645 pthread_create(&env->kvm_cpu_state.thread, NULL, ap_main_loop, env);
1647 while (env->created == 0) {
1648 qemu_cond_wait(&qemu_vcpu_cond);
1651 return 0;
1654 int kvm_vcpu_inited(CPUState *env)
1656 return env->created;
1659 #ifdef TARGET_I386
1660 void kvm_hpet_disable_kpit(void)
1662 struct kvm_pit_state2 ps2;
1664 kvm_get_pit2(kvm_context, &ps2);
1665 ps2.flags |= KVM_PIT_FLAGS_HPET_LEGACY;
1666 kvm_set_pit2(kvm_context, &ps2);
1669 void kvm_hpet_enable_kpit(void)
1671 struct kvm_pit_state2 ps2;
1673 kvm_get_pit2(kvm_context, &ps2);
1674 ps2.flags &= ~KVM_PIT_FLAGS_HPET_LEGACY;
1675 kvm_set_pit2(kvm_context, &ps2);
1677 #endif
1679 int kvm_init_ap(void)
1681 struct sigaction action;
1683 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
1685 signal(SIG_IPI, sig_ipi_handler);
1687 memset(&action, 0, sizeof(action));
1688 action.sa_flags = SA_SIGINFO;
1689 action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
1690 sigaction(SIGBUS, &action, NULL);
1691 prctl(PR_MCE_KILL, 1, 1, 0, 0);
1692 return 0;
1695 /* If we have signalfd, we mask out the signals we want to handle and then
1696 * use signalfd to listen for them. We rely on whatever the current signal
1697 * handler is to dispatch the signals when we receive them.
1700 static void sigfd_handler(void *opaque)
1702 int fd = (unsigned long) opaque;
1703 struct qemu_signalfd_siginfo info;
1704 struct sigaction action;
1705 ssize_t len;
1707 while (1) {
1708 do {
1709 len = read(fd, &info, sizeof(info));
1710 } while (len == -1 && errno == EINTR);
1712 if (len == -1 && errno == EAGAIN) {
1713 break;
1716 if (len != sizeof(info)) {
1717 printf("read from sigfd returned %zd: %m\n", len);
1718 return;
1721 sigaction(info.ssi_signo, NULL, &action);
1722 if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
1723 action.sa_sigaction(info.ssi_signo,
1724 (siginfo_t *)&info, NULL);
1725 } else if (action.sa_handler) {
1726 action.sa_handler(info.ssi_signo);
1731 int kvm_main_loop(void)
1733 sigset_t mask;
1734 int sigfd;
1736 io_thread = pthread_self();
1737 qemu_system_ready = 1;
1739 sigemptyset(&mask);
1740 sigaddset(&mask, SIGIO);
1741 sigaddset(&mask, SIGALRM);
1742 sigaddset(&mask, SIGBUS);
1743 sigprocmask(SIG_BLOCK, &mask, NULL);
1745 sigfd = qemu_signalfd(&mask);
1746 if (sigfd == -1) {
1747 fprintf(stderr, "failed to create signalfd\n");
1748 return -errno;
1751 fcntl(sigfd, F_SETFL, O_NONBLOCK);
1753 qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
1754 (void *)(unsigned long) sigfd);
1756 pthread_cond_broadcast(&qemu_system_cond);
1758 io_thread_sigfd = sigfd;
1759 cpu_single_env = NULL;
1761 while (1) {
1762 main_loop_wait(0);
1763 if (qemu_shutdown_requested()) {
1764 monitor_protocol_event(QEVENT_SHUTDOWN, NULL);
1765 if (qemu_no_shutdown()) {
1766 vm_stop(0);
1767 } else {
1768 break;
1770 } else if (qemu_powerdown_requested()) {
1771 monitor_protocol_event(QEVENT_POWERDOWN, NULL);
1772 qemu_irq_raise(qemu_system_powerdown);
1773 } else if (qemu_reset_requested()) {
1774 qemu_kvm_system_reset();
1775 } else if (kvm_debug_cpu_requested) {
1776 gdb_set_stop_cpu(kvm_debug_cpu_requested);
1777 vm_stop(EXCP_DEBUG);
1778 kvm_debug_cpu_requested = NULL;
1782 pause_all_threads();
1783 pthread_mutex_unlock(&qemu_mutex);
1785 return 0;
1788 #if !defined(TARGET_I386)
/* Non-x86 stub: no arch-specific irq routing setup is needed. */
int kvm_arch_init_irq_routing(void)
{
    return 0;
}
1793 #endif
1795 extern int no_hpet;
1797 static int kvm_create_context(void)
1799 static const char upgrade_note[] =
1800 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
1801 "(see http://sourceforge.net/projects/kvm).\n";
1803 int r;
1805 if (!kvm_irqchip) {
1806 kvm_disable_irqchip_creation(kvm_context);
1808 if (!kvm_pit) {
1809 kvm_disable_pit_creation(kvm_context);
1811 if (kvm_create(kvm_context, 0, NULL) < 0) {
1812 kvm_finalize(kvm_state);
1813 return -1;
1815 r = kvm_arch_qemu_create_context();
1816 if (r < 0) {
1817 kvm_finalize(kvm_state);
1818 return -1;
1820 if (kvm_pit && !kvm_pit_reinject) {
1821 if (kvm_reinject_control(kvm_context, 0)) {
1822 fprintf(stderr, "failure to disable in-kernel PIT reinjection\n");
1823 return -1;
1827 /* There was a nasty bug in < kvm-80 that prevents memory slots from being
1828 * destroyed properly. Since we rely on this capability, refuse to work
1829 * with any kernel without this capability. */
1830 if (!kvm_check_extension(kvm_state, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
1831 fprintf(stderr,
1832 "KVM kernel module broken (DESTROY_MEMORY_REGION).\n%s",
1833 upgrade_note);
1834 return -EINVAL;
1837 r = kvm_arch_init_irq_routing();
1838 if (r < 0) {
1839 return r;
1842 kvm_state->vcpu_events = 0;
1843 #ifdef KVM_CAP_VCPU_EVENTS
1844 kvm_state->vcpu_events = kvm_check_extension(kvm_state, KVM_CAP_VCPU_EVENTS);
1845 #endif
1847 kvm_state->debugregs = 0;
1848 #ifdef KVM_CAP_DEBUGREGS
1849 kvm_state->debugregs = kvm_check_extension(kvm_state, KVM_CAP_DEBUGREGS);
1850 #endif
1852 kvm_init_ap();
1853 if (kvm_irqchip) {
1854 if (!qemu_kvm_has_gsi_routing()) {
1855 irq0override = 0;
1856 #ifdef TARGET_I386
1857 /* if kernel can't do irq routing, interrupt source
1858 * override 0->2 can not be set up as required by hpet,
1859 * so disable hpet.
1861 no_hpet = 1;
1862 } else if (!qemu_kvm_has_pit_state2()) {
1863 no_hpet = 1;
1865 #else
1867 #endif
1870 return 0;
1873 #ifdef KVM_CAP_IRQCHIP
1875 int kvm_set_irq(int irq, int level, int *status)
1877 return kvm_set_irq_level(kvm_context, irq, level, status);
1880 #endif
1882 static void kvm_mutex_unlock(void)
1884 assert(!cpu_single_env);
1885 pthread_mutex_unlock(&qemu_mutex);
1888 static void kvm_mutex_lock(void)
1890 pthread_mutex_lock(&qemu_mutex);
1891 cpu_single_env = NULL;
/* Generic iothread-unlock hook; a no-op unless KVM is active. */
void qemu_mutex_unlock_iothread(void)
{
    if (kvm_enabled()) {
        kvm_mutex_unlock();
    }
}
/* Generic iothread-lock hook; a no-op unless KVM is active. */
void qemu_mutex_lock_iothread(void)
{
    if (kvm_enabled()) {
        kvm_mutex_lock();
    }
}
1908 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1909 void kvm_add_ioperm_data(struct ioperm_data *data)
1911 QLIST_INSERT_HEAD(&ioperm_head, data, entries);
1914 void kvm_remove_ioperm_data(unsigned long start_port, unsigned long num)
1916 struct ioperm_data *data;
1918 data = QLIST_FIRST(&ioperm_head);
1919 while (data) {
1920 struct ioperm_data *next = QLIST_NEXT(data, entries);
1922 if (data->start_port == start_port && data->num == num) {
1923 QLIST_REMOVE(data, entries);
1924 qemu_free(data);
1927 data = next;
1931 void kvm_ioperm(CPUState *env, void *data)
1933 if (kvm_enabled() && qemu_system_ready) {
1934 on_vcpu(env, kvm_arch_do_ioperm, data);
1938 #endif
1940 int kvm_set_boot_cpu_id(uint32_t id)
1942 return kvm_set_boot_vcpu_id(kvm_context, id);