Merge branch 'upstream-merge'
[qemu-kvm/stefanha.git] / qemu-kvm.c
blob4f7cf6de372df9c02e01e163f276c96fb5d0d5ba
1 /*
2 * qemu/kvm integration
4 * Copyright (C) 2006-2008 Qumranet Technologies
6 * Licensed under the terms of the GNU GPL version 2 or higher.
7 */
8 #include "config.h"
9 #include "config-host.h"
11 #include <assert.h>
12 #include <string.h>
13 #include "hw/hw.h"
14 #include "sysemu.h"
15 #include "qemu-common.h"
16 #include "console.h"
17 #include "block.h"
18 #include "compatfd.h"
19 #include "gdbstub.h"
20 #include "monitor.h"
22 #include "qemu-kvm.h"
23 #include "libkvm.h"
25 #include <pthread.h>
26 #include <sys/utsname.h>
27 #include <sys/syscall.h>
28 #include <sys/mman.h>
29 #include <sys/ioctl.h>
30 #include "compatfd.h"
31 #include <sys/prctl.h>
/* Local boolean constants used by this file. */
#define false 0
#define true 1

/* Fallback for system headers that do not provide PR_MCE_KILL. */
#if !defined(PR_MCE_KILL)
#define PR_MCE_KILL 33
#endif

/* Fallbacks for the SIGBUS si_code values tested by the SIGBUS handlers. */
#if !defined(BUS_MCEERR_AR)
#define BUS_MCEERR_AR 4
#endif
#if !defined(BUS_MCEERR_AO)
#define BUS_MCEERR_AO 5
#endif
47 #define EXPECTED_KVM_API_VERSION 12
49 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
50 #error libkvm: userspace and kernel version mismatch
51 #endif
53 int kvm_irqchip = 1;
54 int kvm_pit = 1;
55 int kvm_pit_reinject = 1;
56 int kvm_nested = 0;
59 KVMState *kvm_state;
60 kvm_context_t kvm_context;
62 pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
63 pthread_cond_t qemu_vcpu_cond = PTHREAD_COND_INITIALIZER;
64 pthread_cond_t qemu_system_cond = PTHREAD_COND_INITIALIZER;
65 pthread_cond_t qemu_pause_cond = PTHREAD_COND_INITIALIZER;
66 pthread_cond_t qemu_work_cond = PTHREAD_COND_INITIALIZER;
67 __thread CPUState *current_env;
69 static int qemu_system_ready;
71 #define SIG_IPI (SIGRTMIN+4)
73 pthread_t io_thread;
74 static int io_thread_fd = -1;
75 static int io_thread_sigfd = -1;
77 static CPUState *kvm_debug_cpu_requested;
79 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
80 /* The list of ioperm_data */
81 static QLIST_HEAD(, ioperm_data) ioperm_head;
82 #endif
/* Round x up to the next multiple of y; y has to be a power of two. */
#define ALIGN(x, y) (((x) + ((y) - 1)) & ~((y) - 1))
86 int kvm_abi = EXPECTED_KVM_API_VERSION;
87 int kvm_page_size;
89 #ifdef KVM_CAP_SET_GUEST_DEBUG
90 static int kvm_debug(CPUState *env,
91 struct kvm_debug_exit_arch *arch_info)
93 int handle = kvm_arch_debug(arch_info);
95 if (handle) {
96 kvm_debug_cpu_requested = env;
97 env->stopped = 1;
99 return handle;
101 #endif
/*
 * Report an exit reason that nothing in this file knows how to handle.
 *
 * Prints @reason in hex on stderr and always returns -EINVAL.
 */
static int handle_unhandled(uint64_t reason)
{
    const int rc = -EINVAL;

    fprintf(stderr, "kvm: unhandled exit %" PRIx64 "\n", reason);
    return rc;
}
/* Hardware entry-failure code that gets the extended hint below. */
#define VMX_INVALID_GUEST_STATE 0x80000021

/*
 * Report a failed VM entry on stderr.
 *
 * @reason is the hardware entry failure reason taken from the run structure.
 * For VMX_INVALID_GUEST_STATE an additional explanation is printed.
 * Always returns -EINVAL.
 */
static int handle_failed_vmentry(uint64_t reason)
{
    fprintf(stderr, "kvm: vm entry failed with error 0x%" PRIx64 "\n\n", reason);

    /* Perhaps we will need to check if this machine is intel since exit reason 0x21
       has a different interpretation on SVM */
    if (reason == VMX_INVALID_GUEST_STATE) {
        fprintf(stderr, "If you're running a guest on an Intel machine without\n");
        fprintf(stderr, "unrestricted mode support, the failure can be most likely\n");
        fprintf(stderr, "due to the guest entering an invalid state for Intel VT.\n");
        fprintf(stderr, "For example, the guest may be running in big real mode\n");
        fprintf(stderr, "which is not supported on less recent Intel processors.\n\n");
    }

    return -EINVAL;
}
128 static inline void set_gsi(kvm_context_t kvm, unsigned int gsi)
130 uint32_t *bitmap = kvm->used_gsi_bitmap;
132 if (gsi < kvm->max_gsi)
133 bitmap[gsi / 32] |= 1U << (gsi % 32);
134 else
135 DPRINTF("Invalid GSI %u\n", gsi);
138 static inline void clear_gsi(kvm_context_t kvm, unsigned int gsi)
140 uint32_t *bitmap = kvm->used_gsi_bitmap;
142 if (gsi < kvm->max_gsi)
143 bitmap[gsi / 32] &= ~(1U << (gsi % 32));
144 else
145 DPRINTF("Invalid GSI %u\n", gsi);
148 static int kvm_create_context(void);
150 int kvm_init(int smp_cpus)
152 int fd;
153 int r, gsi_count;
156 fd = open("/dev/kvm", O_RDWR);
157 if (fd == -1) {
158 perror("open /dev/kvm");
159 return -1;
161 r = ioctl(fd, KVM_GET_API_VERSION, 0);
162 if (r == -1) {
163 fprintf(stderr,
164 "kvm kernel version too old: "
165 "KVM_GET_API_VERSION ioctl not supported\n");
166 goto out_close;
168 if (r < EXPECTED_KVM_API_VERSION) {
169 fprintf(stderr, "kvm kernel version too old: "
170 "We expect API version %d or newer, but got "
171 "version %d\n", EXPECTED_KVM_API_VERSION, r);
172 goto out_close;
174 if (r > EXPECTED_KVM_API_VERSION) {
175 fprintf(stderr, "kvm userspace version too old\n");
176 goto out_close;
178 kvm_abi = r;
179 kvm_page_size = getpagesize();
180 kvm_state = qemu_mallocz(sizeof(*kvm_state));
181 kvm_context = &kvm_state->kvm_context;
183 kvm_state->fd = fd;
184 kvm_state->vmfd = -1;
185 kvm_context->opaque = cpu_single_env;
186 kvm_context->dirty_pages_log_all = 0;
187 kvm_context->no_irqchip_creation = 0;
188 kvm_context->no_pit_creation = 0;
190 #ifdef KVM_CAP_SET_GUEST_DEBUG
191 QTAILQ_INIT(&kvm_state->kvm_sw_breakpoints);
192 #endif
194 gsi_count = kvm_get_gsi_count(kvm_context);
195 if (gsi_count > 0) {
196 int gsi_bits, i;
198 /* Round up so we can search ints using ffs */
199 gsi_bits = ALIGN(gsi_count, 32);
200 kvm_context->used_gsi_bitmap = qemu_mallocz(gsi_bits / 8);
201 kvm_context->max_gsi = gsi_bits;
203 /* Mark any over-allocated bits as already in use */
204 for (i = gsi_count; i < gsi_bits; i++)
205 set_gsi(kvm_context, i);
208 kvm_cpu_register_phys_memory_client();
210 pthread_mutex_lock(&qemu_mutex);
211 return kvm_create_context();
213 out_close:
214 close(fd);
215 return -1;
218 static void kvm_finalize(KVMState *s)
220 /* FIXME
221 if (kvm->vcpu_fd[0] != -1)
222 close(kvm->vcpu_fd[0]);
223 if (kvm->vm_fd != -1)
224 close(kvm->vm_fd);
226 close(s->fd);
227 free(s);
230 void kvm_disable_irqchip_creation(kvm_context_t kvm)
232 kvm->no_irqchip_creation = 1;
235 void kvm_disable_pit_creation(kvm_context_t kvm)
237 kvm->no_pit_creation = 1;
240 static void kvm_reset_vcpu(void *opaque)
242 CPUState *env = opaque;
244 kvm_arch_cpu_reset(env);
247 static void kvm_create_vcpu(CPUState *env, int id)
249 long mmap_size;
250 int r;
251 KVMState *s = kvm_state;
253 r = kvm_vm_ioctl(kvm_state, KVM_CREATE_VCPU, id);
254 if (r < 0) {
255 fprintf(stderr, "kvm_create_vcpu: %m\n");
256 fprintf(stderr, "Failed to create vCPU. Check the -smp parameter.\n");
257 goto err;
260 env->kvm_fd = r;
261 env->kvm_state = kvm_state;
263 mmap_size = kvm_ioctl(kvm_state, KVM_GET_VCPU_MMAP_SIZE, 0);
264 if (mmap_size < 0) {
265 fprintf(stderr, "get vcpu mmap size: %m\n");
266 goto err_fd;
268 env->kvm_run =
269 mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, env->kvm_fd,
271 if (env->kvm_run == MAP_FAILED) {
272 fprintf(stderr, "mmap vcpu area: %m\n");
273 goto err_fd;
276 #ifdef KVM_CAP_COALESCED_MMIO
277 if (s->coalesced_mmio && !s->coalesced_mmio_ring)
278 s->coalesced_mmio_ring = (void *) env->kvm_run +
279 s->coalesced_mmio * PAGE_SIZE;
280 #endif
282 r = kvm_arch_init_vcpu(env);
283 if (r == 0) {
284 qemu_register_reset(kvm_reset_vcpu, env);
287 return;
288 err_fd:
289 close(env->kvm_fd);
290 err:
291 /* We're no good with semi-broken states. */
292 abort();
295 static int kvm_set_boot_vcpu_id(kvm_context_t kvm, uint32_t id)
297 #ifdef KVM_CAP_SET_BOOT_CPU_ID
298 int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_SET_BOOT_CPU_ID);
299 if (r > 0)
300 return kvm_vm_ioctl(kvm_state, KVM_SET_BOOT_CPU_ID, id);
301 return -ENOSYS;
302 #else
303 return -ENOSYS;
304 #endif
307 int kvm_create_vm(kvm_context_t kvm)
309 int fd;
310 #ifdef KVM_CAP_IRQ_ROUTING
311 kvm->irq_routes = qemu_mallocz(sizeof(*kvm->irq_routes));
312 kvm->nr_allocated_irq_routes = 0;
313 #endif
315 fd = kvm_ioctl(kvm_state, KVM_CREATE_VM, 0);
316 if (fd < 0) {
317 fprintf(stderr, "kvm_create_vm: %m\n");
318 return -1;
320 kvm_state->vmfd = fd;
321 return 0;
324 static int kvm_create_default_phys_mem(kvm_context_t kvm,
325 unsigned long phys_mem_bytes,
326 void **vm_mem)
328 #ifdef KVM_CAP_USER_MEMORY
329 int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY);
330 if (r > 0)
331 return 0;
332 fprintf(stderr,
333 "Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported\n");
334 #else
335 #error Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported
336 #endif
337 return -1;
340 void kvm_create_irqchip(kvm_context_t kvm)
342 int r;
344 kvm->irqchip_in_kernel = 0;
345 #ifdef KVM_CAP_IRQCHIP
346 if (!kvm->no_irqchip_creation) {
347 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP);
348 if (r > 0) { /* kernel irqchip supported */
349 r = kvm_vm_ioctl(kvm_state, KVM_CREATE_IRQCHIP);
350 if (r >= 0) {
351 kvm->irqchip_inject_ioctl = KVM_IRQ_LINE;
352 #if defined(KVM_CAP_IRQ_INJECT_STATUS) && defined(KVM_IRQ_LINE_STATUS)
353 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION,
354 KVM_CAP_IRQ_INJECT_STATUS);
355 if (r > 0)
356 kvm->irqchip_inject_ioctl = KVM_IRQ_LINE_STATUS;
357 #endif
358 kvm->irqchip_in_kernel = 1;
359 } else
360 fprintf(stderr, "Create kernel PIC irqchip failed\n");
363 #endif
364 kvm_state->irqchip_in_kernel = kvm->irqchip_in_kernel;
367 int kvm_create(kvm_context_t kvm, unsigned long phys_mem_bytes, void **vm_mem)
369 int r, i;
371 r = kvm_create_vm(kvm);
372 if (r < 0)
373 return r;
374 r = kvm_arch_create(kvm, phys_mem_bytes, vm_mem);
375 if (r < 0)
376 return r;
377 for (i = 0; i < ARRAY_SIZE(kvm_state->slots); i++)
378 kvm_state->slots[i].slot = i;
380 r = kvm_create_default_phys_mem(kvm, phys_mem_bytes, vm_mem);
381 if (r < 0)
382 return r;
383 kvm_create_irqchip(kvm);
385 return 0;
388 #ifdef KVM_CAP_IRQCHIP
390 int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int *status)
392 struct kvm_irq_level event;
393 int r;
395 if (!kvm->irqchip_in_kernel)
396 return 0;
397 event.level = level;
398 event.irq = irq;
399 r = kvm_vm_ioctl(kvm_state, kvm->irqchip_inject_ioctl, &event);
400 if (r < 0)
401 perror("kvm_set_irq_level");
403 if (status) {
404 #ifdef KVM_CAP_IRQ_INJECT_STATUS
405 *status =
406 (kvm->irqchip_inject_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
407 #else
408 *status = 1;
409 #endif
412 return 1;
415 int kvm_get_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
417 int r;
419 if (!kvm->irqchip_in_kernel)
420 return 0;
421 r = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip);
422 if (r < 0) {
423 perror("kvm_get_irqchip\n");
425 return r;
428 int kvm_set_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
430 int r;
432 if (!kvm->irqchip_in_kernel)
433 return 0;
434 r = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip);
435 if (r < 0) {
436 perror("kvm_set_irqchip\n");
438 return r;
441 #endif
443 static int handle_debug(CPUState *env)
445 #ifdef KVM_CAP_SET_GUEST_DEBUG
446 struct kvm_run *run = env->kvm_run;
448 return kvm_debug(env, &run->debug.arch);
449 #else
450 return 0;
451 #endif
454 int kvm_get_regs(CPUState *env, struct kvm_regs *regs)
456 return kvm_vcpu_ioctl(env, KVM_GET_REGS, regs);
459 int kvm_set_regs(CPUState *env, struct kvm_regs *regs)
461 return kvm_vcpu_ioctl(env, KVM_SET_REGS, regs);
464 int kvm_get_fpu(CPUState *env, struct kvm_fpu *fpu)
466 return kvm_vcpu_ioctl(env, KVM_GET_FPU, fpu);
469 int kvm_set_fpu(CPUState *env, struct kvm_fpu *fpu)
471 return kvm_vcpu_ioctl(env, KVM_SET_FPU, fpu);
474 int kvm_get_sregs(CPUState *env, struct kvm_sregs *sregs)
476 return kvm_vcpu_ioctl(env, KVM_GET_SREGS, sregs);
479 int kvm_set_sregs(CPUState *env, struct kvm_sregs *sregs)
481 return kvm_vcpu_ioctl(env, KVM_SET_SREGS, sregs);
484 #ifdef KVM_CAP_MP_STATE
485 int kvm_get_mpstate(CPUState *env, struct kvm_mp_state *mp_state)
487 int r;
489 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE);
490 if (r > 0)
491 return kvm_vcpu_ioctl(env, KVM_GET_MP_STATE, mp_state);
492 return -ENOSYS;
495 int kvm_set_mpstate(CPUState *env, struct kvm_mp_state *mp_state)
497 int r;
499 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE);
500 if (r > 0)
501 return kvm_vcpu_ioctl(env, KVM_SET_MP_STATE, mp_state);
502 return -ENOSYS;
504 #endif
506 #ifdef KVM_CAP_XSAVE
507 int kvm_get_xsave(CPUState *env, struct kvm_xsave *xsave)
509 return kvm_vcpu_ioctl(env, KVM_GET_XSAVE, xsave);
512 int kvm_set_xsave(CPUState *env, struct kvm_xsave *xsave)
514 return kvm_vcpu_ioctl(env, KVM_SET_XSAVE, xsave);
516 #endif
518 #ifdef KVM_CAP_XCRS
519 int kvm_get_xcrs(CPUState *env, struct kvm_xcrs *xcrs)
521 return kvm_vcpu_ioctl(env, KVM_GET_XCRS, xcrs);
524 int kvm_set_xcrs(CPUState *env, struct kvm_xcrs *xcrs)
526 return kvm_vcpu_ioctl(env, KVM_SET_XCRS, xcrs);
528 #endif
530 static int handle_mmio(CPUState *env)
532 unsigned long addr = env->kvm_run->mmio.phys_addr;
533 struct kvm_run *kvm_run = env->kvm_run;
534 void *data = kvm_run->mmio.data;
536 /* hack: Red Hat 7.1 generates these weird accesses. */
537 if ((addr > 0xa0000 - 4 && addr <= 0xa0000) && kvm_run->mmio.len == 3)
538 return 0;
540 cpu_physical_memory_rw(addr, data, kvm_run->mmio.len, kvm_run->mmio.is_write);
541 return 0;
544 int handle_io_window(kvm_context_t kvm)
546 return 1;
549 int handle_shutdown(kvm_context_t kvm, CPUState *env)
551 /* stop the current vcpu from going back to guest mode */
552 env->stopped = 1;
554 qemu_system_reset_request();
555 return 1;
558 static inline void push_nmi(kvm_context_t kvm)
560 #ifdef KVM_CAP_USER_NMI
561 kvm_arch_push_nmi(kvm->opaque);
562 #endif /* KVM_CAP_USER_NMI */
565 void post_kvm_run(kvm_context_t kvm, CPUState *env)
567 pthread_mutex_lock(&qemu_mutex);
568 kvm_arch_post_run(env, env->kvm_run);
569 cpu_single_env = env;
572 int pre_kvm_run(kvm_context_t kvm, CPUState *env)
574 kvm_arch_pre_run(env, env->kvm_run);
576 pthread_mutex_unlock(&qemu_mutex);
577 return 0;
580 int kvm_is_ready_for_interrupt_injection(CPUState *env)
582 return env->kvm_run->ready_for_interrupt_injection;
585 int kvm_run(CPUState *env)
587 int r;
588 kvm_context_t kvm = &env->kvm_state->kvm_context;
589 struct kvm_run *run = env->kvm_run;
590 int fd = env->kvm_fd;
592 again:
593 if (env->kvm_vcpu_dirty) {
594 kvm_arch_load_regs(env, KVM_PUT_RUNTIME_STATE);
595 env->kvm_vcpu_dirty = 0;
597 push_nmi(kvm);
598 #if !defined(__s390__)
599 if (!kvm->irqchip_in_kernel)
600 run->request_interrupt_window = kvm_arch_try_push_interrupts(env);
601 #endif
603 r = pre_kvm_run(kvm, env);
604 if (r)
605 return r;
606 if (env->exit_request) {
607 env->exit_request = 0;
608 pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
610 r = ioctl(fd, KVM_RUN, 0);
612 if (r == -1 && errno != EINTR && errno != EAGAIN) {
613 r = -errno;
614 post_kvm_run(kvm, env);
615 fprintf(stderr, "kvm_run: %s\n", strerror(-r));
616 return r;
619 post_kvm_run(kvm, env);
621 kvm_flush_coalesced_mmio_buffer();
623 #if !defined(__s390__)
624 if (r == -1) {
625 r = handle_io_window(kvm);
626 goto more;
628 #endif
629 if (1) {
630 switch (run->exit_reason) {
631 case KVM_EXIT_UNKNOWN:
632 r = handle_unhandled(run->hw.hardware_exit_reason);
633 break;
634 case KVM_EXIT_FAIL_ENTRY:
635 r = handle_failed_vmentry(run->fail_entry.hardware_entry_failure_reason);
636 break;
637 case KVM_EXIT_EXCEPTION:
638 fprintf(stderr, "exception %d (%x)\n", run->ex.exception,
639 run->ex.error_code);
640 kvm_show_regs(env);
641 kvm_show_code(env);
642 abort();
643 break;
644 case KVM_EXIT_IO:
645 r = kvm_handle_io(run->io.port,
646 (uint8_t *)run + run->io.data_offset,
647 run->io.direction,
648 run->io.size,
649 run->io.count);
650 r = 0;
651 break;
652 case KVM_EXIT_DEBUG:
653 r = handle_debug(env);
654 break;
655 case KVM_EXIT_MMIO:
656 r = handle_mmio(env);
657 break;
658 case KVM_EXIT_HLT:
659 r = kvm_arch_halt(env);
660 break;
661 case KVM_EXIT_IRQ_WINDOW_OPEN:
662 break;
663 case KVM_EXIT_SHUTDOWN:
664 r = handle_shutdown(kvm, env);
665 break;
666 #if defined(__s390__)
667 case KVM_EXIT_S390_SIEIC:
668 r = kvm_s390_handle_intercept(kvm, env, run);
669 break;
670 case KVM_EXIT_S390_RESET:
671 r = kvm_s390_handle_reset(kvm, env, run);
672 break;
673 #endif
674 case KVM_EXIT_INTERNAL_ERROR:
675 kvm_handle_internal_error(env, run);
676 r = 1;
677 break;
678 default:
679 if (kvm_arch_run(env)) {
680 fprintf(stderr, "unhandled vm exit: 0x%x\n", run->exit_reason);
681 kvm_show_regs(env);
682 abort();
684 break;
687 more:
688 if (!r)
689 goto again;
690 return r;
693 int kvm_inject_irq(CPUState *env, unsigned irq)
695 struct kvm_interrupt intr;
697 intr.irq = irq;
698 return kvm_vcpu_ioctl(env, KVM_INTERRUPT, &intr);
701 int kvm_inject_nmi(CPUState *env)
703 #ifdef KVM_CAP_USER_NMI
704 return kvm_vcpu_ioctl(env, KVM_NMI);
705 #else
706 return -ENOSYS;
707 #endif
710 int kvm_init_coalesced_mmio(kvm_context_t kvm)
712 int r = 0;
713 kvm_state->coalesced_mmio = 0;
714 #ifdef KVM_CAP_COALESCED_MMIO
715 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
716 if (r > 0) {
717 kvm_state->coalesced_mmio = r;
718 return 0;
720 #endif
721 return r;
724 #ifdef KVM_CAP_DEVICE_ASSIGNMENT
725 int kvm_assign_pci_device(kvm_context_t kvm,
726 struct kvm_assigned_pci_dev *assigned_dev)
728 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_PCI_DEVICE, assigned_dev);
731 static int kvm_old_assign_irq(kvm_context_t kvm,
732 struct kvm_assigned_irq *assigned_irq)
734 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_IRQ, assigned_irq);
737 #ifdef KVM_CAP_ASSIGN_DEV_IRQ
738 int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
740 int ret;
742 ret = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_ASSIGN_DEV_IRQ);
743 if (ret > 0) {
744 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_DEV_IRQ, assigned_irq);
747 return kvm_old_assign_irq(kvm, assigned_irq);
750 int kvm_deassign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
752 return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_DEV_IRQ, assigned_irq);
754 #else
755 int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
757 return kvm_old_assign_irq(kvm, assigned_irq);
759 #endif
760 #endif
762 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT
763 int kvm_deassign_pci_device(kvm_context_t kvm,
764 struct kvm_assigned_pci_dev *assigned_dev)
766 return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_PCI_DEVICE, assigned_dev);
768 #endif
770 int kvm_reinject_control(kvm_context_t kvm, int pit_reinject)
772 #ifdef KVM_CAP_REINJECT_CONTROL
773 int r;
774 struct kvm_reinject_control control;
776 control.pit_reinject = pit_reinject;
778 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_REINJECT_CONTROL);
779 if (r > 0) {
780 return kvm_vm_ioctl(kvm_state, KVM_REINJECT_CONTROL, &control);
782 #endif
783 return -ENOSYS;
786 int kvm_has_gsi_routing(kvm_context_t kvm)
788 int r = 0;
790 #ifdef KVM_CAP_IRQ_ROUTING
791 r = kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
792 #endif
793 return r;
796 int kvm_get_gsi_count(kvm_context_t kvm)
798 #ifdef KVM_CAP_IRQ_ROUTING
799 return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
800 #else
801 return -EINVAL;
802 #endif
805 int kvm_clear_gsi_routes(kvm_context_t kvm)
807 #ifdef KVM_CAP_IRQ_ROUTING
808 kvm->irq_routes->nr = 0;
809 return 0;
810 #else
811 return -EINVAL;
812 #endif
815 int kvm_add_routing_entry(kvm_context_t kvm,
816 struct kvm_irq_routing_entry *entry)
818 #ifdef KVM_CAP_IRQ_ROUTING
819 struct kvm_irq_routing *z;
820 struct kvm_irq_routing_entry *new;
821 int n, size;
823 if (kvm->irq_routes->nr == kvm->nr_allocated_irq_routes) {
824 n = kvm->nr_allocated_irq_routes * 2;
825 if (n < 64)
826 n = 64;
827 size = sizeof(struct kvm_irq_routing);
828 size += n * sizeof(*new);
829 z = realloc(kvm->irq_routes, size);
830 if (!z)
831 return -ENOMEM;
832 kvm->nr_allocated_irq_routes = n;
833 kvm->irq_routes = z;
835 n = kvm->irq_routes->nr++;
836 new = &kvm->irq_routes->entries[n];
837 memset(new, 0, sizeof(*new));
838 new->gsi = entry->gsi;
839 new->type = entry->type;
840 new->flags = entry->flags;
841 new->u = entry->u;
843 set_gsi(kvm, entry->gsi);
845 return 0;
846 #else
847 return -ENOSYS;
848 #endif
851 int kvm_add_irq_route(kvm_context_t kvm, int gsi, int irqchip, int pin)
853 #ifdef KVM_CAP_IRQ_ROUTING
854 struct kvm_irq_routing_entry e;
856 e.gsi = gsi;
857 e.type = KVM_IRQ_ROUTING_IRQCHIP;
858 e.flags = 0;
859 e.u.irqchip.irqchip = irqchip;
860 e.u.irqchip.pin = pin;
861 return kvm_add_routing_entry(kvm, &e);
862 #else
863 return -ENOSYS;
864 #endif
867 int kvm_del_routing_entry(kvm_context_t kvm,
868 struct kvm_irq_routing_entry *entry)
870 #ifdef KVM_CAP_IRQ_ROUTING
871 struct kvm_irq_routing_entry *e, *p;
872 int i, gsi, found = 0;
874 gsi = entry->gsi;
876 for (i = 0; i < kvm->irq_routes->nr; ++i) {
877 e = &kvm->irq_routes->entries[i];
878 if (e->type == entry->type && e->gsi == gsi) {
879 switch (e->type) {
880 case KVM_IRQ_ROUTING_IRQCHIP:{
881 if (e->u.irqchip.irqchip ==
882 entry->u.irqchip.irqchip
883 && e->u.irqchip.pin == entry->u.irqchip.pin) {
884 p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
885 *e = *p;
886 found = 1;
888 break;
890 case KVM_IRQ_ROUTING_MSI:{
891 if (e->u.msi.address_lo ==
892 entry->u.msi.address_lo
893 && e->u.msi.address_hi ==
894 entry->u.msi.address_hi
895 && e->u.msi.data == entry->u.msi.data) {
896 p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
897 *e = *p;
898 found = 1;
900 break;
902 default:
903 break;
905 if (found) {
906 /* If there are no other users of this GSI
907 * mark it available in the bitmap */
908 for (i = 0; i < kvm->irq_routes->nr; i++) {
909 e = &kvm->irq_routes->entries[i];
910 if (e->gsi == gsi)
911 break;
913 if (i == kvm->irq_routes->nr)
914 clear_gsi(kvm, gsi);
916 return 0;
920 return -ESRCH;
921 #else
922 return -ENOSYS;
923 #endif
926 int kvm_update_routing_entry(kvm_context_t kvm,
927 struct kvm_irq_routing_entry *entry,
928 struct kvm_irq_routing_entry *newentry)
930 #ifdef KVM_CAP_IRQ_ROUTING
931 struct kvm_irq_routing_entry *e;
932 int i;
934 if (entry->gsi != newentry->gsi || entry->type != newentry->type) {
935 return -EINVAL;
938 for (i = 0; i < kvm->irq_routes->nr; ++i) {
939 e = &kvm->irq_routes->entries[i];
940 if (e->type != entry->type || e->gsi != entry->gsi) {
941 continue;
943 switch (e->type) {
944 case KVM_IRQ_ROUTING_IRQCHIP:
945 if (e->u.irqchip.irqchip == entry->u.irqchip.irqchip &&
946 e->u.irqchip.pin == entry->u.irqchip.pin) {
947 memcpy(&e->u.irqchip, &newentry->u.irqchip,
948 sizeof e->u.irqchip);
949 return 0;
951 break;
952 case KVM_IRQ_ROUTING_MSI:
953 if (e->u.msi.address_lo == entry->u.msi.address_lo &&
954 e->u.msi.address_hi == entry->u.msi.address_hi &&
955 e->u.msi.data == entry->u.msi.data) {
956 memcpy(&e->u.msi, &newentry->u.msi, sizeof e->u.msi);
957 return 0;
959 break;
960 default:
961 break;
964 return -ESRCH;
965 #else
966 return -ENOSYS;
967 #endif
970 int kvm_del_irq_route(kvm_context_t kvm, int gsi, int irqchip, int pin)
972 #ifdef KVM_CAP_IRQ_ROUTING
973 struct kvm_irq_routing_entry e;
975 e.gsi = gsi;
976 e.type = KVM_IRQ_ROUTING_IRQCHIP;
977 e.flags = 0;
978 e.u.irqchip.irqchip = irqchip;
979 e.u.irqchip.pin = pin;
980 return kvm_del_routing_entry(kvm, &e);
981 #else
982 return -ENOSYS;
983 #endif
986 int kvm_commit_irq_routes(kvm_context_t kvm)
988 #ifdef KVM_CAP_IRQ_ROUTING
989 kvm->irq_routes->flags = 0;
990 return kvm_vm_ioctl(kvm_state, KVM_SET_GSI_ROUTING, kvm->irq_routes);
991 #else
992 return -ENOSYS;
993 #endif
996 int kvm_get_irq_route_gsi(kvm_context_t kvm)
998 int i, bit;
999 uint32_t *buf = kvm->used_gsi_bitmap;
1001 /* Return the lowest unused GSI in the bitmap */
1002 for (i = 0; i < kvm->max_gsi / 32; i++) {
1003 bit = ffs(~buf[i]);
1004 if (!bit)
1005 continue;
1007 return bit - 1 + i * 32;
1010 return -ENOSPC;
1013 #ifdef KVM_CAP_DEVICE_MSIX
1014 int kvm_assign_set_msix_nr(kvm_context_t kvm,
1015 struct kvm_assigned_msix_nr *msix_nr)
1017 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_NR, msix_nr);
1020 int kvm_assign_set_msix_entry(kvm_context_t kvm,
1021 struct kvm_assigned_msix_entry *entry)
1023 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_ENTRY, entry);
1025 #endif
1027 #if defined(KVM_CAP_IRQFD) && defined(CONFIG_EVENTFD)
1029 #include <sys/eventfd.h>
1031 static int _kvm_irqfd(kvm_context_t kvm, int fd, int gsi, int flags)
1033 struct kvm_irqfd data = {
1034 .fd = fd,
1035 .gsi = gsi,
1036 .flags = flags,
1039 return kvm_vm_ioctl(kvm_state, KVM_IRQFD, &data);
1042 int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
1044 int r;
1045 int fd;
1047 if (!kvm_check_extension(kvm_state, KVM_CAP_IRQFD))
1048 return -ENOENT;
1050 fd = eventfd(0, 0);
1051 if (fd < 0)
1052 return -errno;
1054 r = _kvm_irqfd(kvm, fd, gsi, 0);
1055 if (r < 0) {
1056 close(fd);
1057 return -errno;
1060 return fd;
1063 #else /* KVM_CAP_IRQFD */
1065 int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
1067 return -ENOSYS;
1070 #endif /* KVM_CAP_IRQFD */
/* Return the calling thread's id as reported by the gettid system call. */
unsigned long kvm_get_thread_id(void)
{
    unsigned long tid = syscall(SYS_gettid);

    return tid;
}
1076 static void qemu_cond_wait(pthread_cond_t *cond)
1078 CPUState *env = cpu_single_env;
1080 pthread_cond_wait(cond, &qemu_mutex);
1081 cpu_single_env = env;
/* Handler for SIG_IPI; intentionally does nothing. */
static void sig_ipi_handler(int n)
{
    (void) n;
}
/* Report a hardware memory error on stderr and exit with status 1. */
static void hardware_memory_error(void)
{
    fputs("Hardware memory error!\n", stderr);
    exit(1);
}
1094 static void sigbus_reraise(void)
1096 sigset_t set;
1097 struct sigaction action;
1099 memset(&action, 0, sizeof(action));
1100 action.sa_handler = SIG_DFL;
1101 if (!sigaction(SIGBUS, &action, NULL)) {
1102 raise(SIGBUS);
1103 sigemptyset(&set);
1104 sigaddset(&set, SIGBUS);
1105 sigprocmask(SIG_UNBLOCK, &set, NULL);
1107 perror("Failed to re-raise SIGBUS!\n");
1108 abort();
1111 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
1112 void *ctx)
1114 #if defined(KVM_CAP_MCE) && defined(TARGET_I386)
1115 if (first_cpu->mcg_cap && siginfo->ssi_addr
1116 && siginfo->ssi_code == BUS_MCEERR_AO) {
1117 uint64_t status;
1118 unsigned long paddr;
1119 CPUState *cenv;
1121 /* Hope we are lucky for AO MCE */
1122 if (do_qemu_ram_addr_from_host((void *)(intptr_t)siginfo->ssi_addr,
1123 &paddr)) {
1124 fprintf(stderr, "Hardware memory error for memory used by "
1125 "QEMU itself instead of guest system!: %llx\n",
1126 (unsigned long long)siginfo->ssi_addr);
1127 return;
1129 status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1130 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1131 | 0xc0;
1132 kvm_inject_x86_mce(first_cpu, 9, status,
1133 MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
1134 (MCM_ADDR_PHYS << 6) | 0xc, 1);
1135 for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
1136 kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
1137 MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
1138 } else
1139 #endif
1141 if (siginfo->ssi_code == BUS_MCEERR_AO)
1142 return;
1143 else if (siginfo->ssi_code == BUS_MCEERR_AR)
1144 hardware_memory_error();
1145 else
1146 sigbus_reraise();
1150 static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
1152 struct qemu_work_item wi;
1154 if (env == current_env) {
1155 func(data);
1156 return;
1159 wi.func = func;
1160 wi.data = data;
1161 if (!env->kvm_cpu_state.queued_work_first)
1162 env->kvm_cpu_state.queued_work_first = &wi;
1163 else
1164 env->kvm_cpu_state.queued_work_last->next = &wi;
1165 env->kvm_cpu_state.queued_work_last = &wi;
1166 wi.next = NULL;
1167 wi.done = false;
1169 pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
1170 while (!wi.done)
1171 qemu_cond_wait(&qemu_work_cond);
1174 static void do_kvm_cpu_synchronize_state(void *_env)
1176 CPUState *env = _env;
1178 if (!env->kvm_vcpu_dirty) {
1179 kvm_arch_save_regs(env);
1180 env->kvm_vcpu_dirty = 1;
1184 void kvm_cpu_synchronize_state(CPUState *env)
1186 if (!env->kvm_vcpu_dirty)
1187 on_vcpu(env, do_kvm_cpu_synchronize_state, env);
1190 void kvm_cpu_synchronize_post_reset(CPUState *env)
1192 kvm_arch_load_regs(env, KVM_PUT_RESET_STATE);
1193 env->kvm_vcpu_dirty = 0;
1196 void kvm_cpu_synchronize_post_init(CPUState *env)
1198 kvm_arch_load_regs(env, KVM_PUT_FULL_STATE);
1199 env->kvm_vcpu_dirty = 0;
1202 static void inject_interrupt(void *data)
1204 cpu_interrupt(current_env, (long) data);
1207 void kvm_inject_interrupt(CPUState *env, int mask)
1209 on_vcpu(env, inject_interrupt, (void *) (long) mask);
1212 void kvm_update_interrupt_request(CPUState *env)
1214 int signal = 0;
1216 if (env) {
1217 if (!current_env || !current_env->created)
1218 signal = 1;
1220 * Testing for created here is really redundant
1222 if (current_env && current_env->created &&
1223 env != current_env && !env->kvm_cpu_state.signalled)
1224 signal = 1;
1226 if (signal) {
1227 env->kvm_cpu_state.signalled = 1;
1228 if (env->kvm_cpu_state.thread)
1229 pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
1234 int kvm_cpu_exec(CPUState *env)
1236 int r;
1238 r = kvm_run(env);
1239 if (r < 0) {
1240 printf("kvm_run returned %d\n", r);
1241 vm_stop(0);
1244 return 0;
1247 int kvm_cpu_is_stopped(CPUState *env)
1249 return !vm_running || env->stopped;
1252 static void flush_queued_work(CPUState *env)
1254 struct qemu_work_item *wi;
1256 if (!env->kvm_cpu_state.queued_work_first)
1257 return;
1259 while ((wi = env->kvm_cpu_state.queued_work_first)) {
1260 env->kvm_cpu_state.queued_work_first = wi->next;
1261 wi->func(wi->data);
1262 wi->done = true;
1264 env->kvm_cpu_state.queued_work_last = NULL;
1265 pthread_cond_broadcast(&qemu_work_cond);
1268 static int kvm_mce_in_exception(CPUState *env)
1270 struct kvm_msr_entry msr_mcg_status = {
1271 .index = MSR_MCG_STATUS,
1273 int r;
1275 r = kvm_get_msrs(env, &msr_mcg_status, 1);
1276 if (r == -1 || r == 0)
1277 return -1;
1278 return !!(msr_mcg_status.data & MCG_STATUS_MCIP);
1281 static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
1283 #if defined(KVM_CAP_MCE) && defined(TARGET_I386)
1284 struct kvm_x86_mce mce = {
1285 .bank = 9,
1287 unsigned long paddr;
1288 int r;
1290 if (env->mcg_cap && siginfo->si_addr
1291 && (siginfo->si_code == BUS_MCEERR_AR
1292 || siginfo->si_code == BUS_MCEERR_AO)) {
1293 if (siginfo->si_code == BUS_MCEERR_AR) {
1294 /* Fake an Intel architectural Data Load SRAR UCR */
1295 mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1296 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1297 | MCI_STATUS_AR | 0x134;
1298 mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
1299 mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
1300 } else {
1302 * If there is an MCE excpetion being processed, ignore
1303 * this SRAO MCE
1305 r = kvm_mce_in_exception(env);
1306 if (r == -1)
1307 fprintf(stderr, "Failed to get MCE status\n");
1308 else if (r)
1309 return;
1310 /* Fake an Intel architectural Memory scrubbing UCR */
1311 mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1312 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1313 | 0xc0;
1314 mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
1315 mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
1317 if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) {
1318 fprintf(stderr, "Hardware memory error for memory used by "
1319 "QEMU itself instaed of guest system!\n");
1320 /* Hope we are lucky for AO MCE */
1321 if (siginfo->si_code == BUS_MCEERR_AO)
1322 return;
1323 else
1324 hardware_memory_error();
1326 mce.addr = paddr;
1327 r = kvm_set_mce(env, &mce);
1328 if (r < 0) {
1329 fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
1330 abort();
1332 } else
1333 #endif
1335 if (siginfo->si_code == BUS_MCEERR_AO)
1336 return;
1337 else if (siginfo->si_code == BUS_MCEERR_AR)
1338 hardware_memory_error();
1339 else
1340 sigbus_reraise();
1344 static void kvm_main_loop_wait(CPUState *env, int timeout)
1346 struct timespec ts;
1347 int r, e;
1348 siginfo_t siginfo;
1349 sigset_t waitset;
1350 sigset_t chkset;
1352 ts.tv_sec = timeout / 1000;
1353 ts.tv_nsec = (timeout % 1000) * 1000000;
1354 sigemptyset(&waitset);
1355 sigaddset(&waitset, SIG_IPI);
1356 sigaddset(&waitset, SIGBUS);
1358 do {
1359 pthread_mutex_unlock(&qemu_mutex);
1361 r = sigtimedwait(&waitset, &siginfo, &ts);
1362 e = errno;
1364 pthread_mutex_lock(&qemu_mutex);
1366 if (r == -1 && !(e == EAGAIN || e == EINTR)) {
1367 printf("sigtimedwait: %s\n", strerror(e));
1368 exit(1);
1371 switch (r) {
1372 case SIGBUS:
1373 kvm_on_sigbus(env, &siginfo);
1374 break;
1375 default:
1376 break;
1379 r = sigpending(&chkset);
1380 if (r == -1) {
1381 printf("sigpending: %s\n", strerror(e));
1382 exit(1);
1384 } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
1386 cpu_single_env = env;
1387 flush_queued_work(env);
1389 if (env->stop) {
1390 env->stop = 0;
1391 env->stopped = 1;
1392 pthread_cond_signal(&qemu_pause_cond);
1395 env->kvm_cpu_state.signalled = 0;
1398 static int all_threads_paused(void)
1400 CPUState *penv = first_cpu;
1402 while (penv) {
1403 if (penv->stop)
1404 return 0;
1405 penv = (CPUState *) penv->next_cpu;
1408 return 1;
1411 static void pause_all_threads(void)
1413 CPUState *penv = first_cpu;
1415 while (penv) {
1416 if (penv != cpu_single_env) {
1417 penv->stop = 1;
1418 pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI);
1419 } else {
1420 penv->stop = 0;
1421 penv->stopped = 1;
1422 cpu_exit(penv);
1424 penv = (CPUState *) penv->next_cpu;
1427 while (!all_threads_paused())
1428 qemu_cond_wait(&qemu_pause_cond);
1431 static void resume_all_threads(void)
1433 CPUState *penv = first_cpu;
1435 assert(!cpu_single_env);
1437 while (penv) {
1438 penv->stop = 0;
1439 penv->stopped = 0;
1440 pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI);
1441 penv = (CPUState *) penv->next_cpu;
/*
 * VM run-state callback: resume all vcpu threads when the VM starts
 * running, pause them otherwise.  context and reason are unused.
 */
static void kvm_vm_state_change_handler(void *context, int running, int reason)
{
    if (!running) {
        pause_all_threads();
        return;
    }

    resume_all_threads();
}
1453 static void setup_kernel_sigmask(CPUState *env)
1455 sigset_t set;
1457 sigemptyset(&set);
1458 sigaddset(&set, SIGUSR2);
1459 sigaddset(&set, SIGIO);
1460 sigaddset(&set, SIGALRM);
1461 sigprocmask(SIG_BLOCK, &set, NULL);
1463 sigprocmask(SIG_BLOCK, NULL, &set);
1464 sigdelset(&set, SIG_IPI);
1465 sigdelset(&set, SIGBUS);
1467 kvm_set_signal_mask(env, &set);
/* Reset the machine with every vcpu thread paused for the duration. */
static void qemu_kvm_system_reset(void)
{
    pause_all_threads();
    qemu_system_reset();
    resume_all_threads();
}
1479 static void process_irqchip_events(CPUState *env)
1481 kvm_arch_process_irqchip_events(env);
1482 if (kvm_arch_has_work(env))
1483 env->halted = 0;
1486 static int kvm_main_loop_cpu(CPUState *env)
1488 while (1) {
1489 int run_cpu = !kvm_cpu_is_stopped(env);
1490 if (run_cpu && !kvm_irqchip_in_kernel()) {
1491 process_irqchip_events(env);
1492 run_cpu = !env->halted;
1494 if (run_cpu) {
1495 kvm_cpu_exec(env);
1496 kvm_main_loop_wait(env, 0);
1497 } else {
1498 kvm_main_loop_wait(env, 1000);
1501 pthread_mutex_unlock(&qemu_mutex);
1502 return 0;
1505 static void *ap_main_loop(void *_env)
1507 CPUState *env = _env;
1508 sigset_t signals;
1509 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1510 struct ioperm_data *data = NULL;
1511 #endif
1513 current_env = env;
1514 env->thread_id = kvm_get_thread_id();
1515 sigfillset(&signals);
1516 sigprocmask(SIG_BLOCK, &signals, NULL);
1518 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1519 /* do ioperm for io ports of assigned devices */
1520 QLIST_FOREACH(data, &ioperm_head, entries)
1521 on_vcpu(env, kvm_arch_do_ioperm, data);
1522 #endif
1524 pthread_mutex_lock(&qemu_mutex);
1525 cpu_single_env = env;
1527 kvm_create_vcpu(env, env->cpu_index);
1528 setup_kernel_sigmask(env);
1530 /* signal VCPU creation */
1531 current_env->created = 1;
1532 pthread_cond_signal(&qemu_vcpu_cond);
1534 /* and wait for machine initialization */
1535 while (!qemu_system_ready)
1536 qemu_cond_wait(&qemu_system_cond);
1538 /* re-initialize cpu_single_env after re-acquiring qemu_mutex */
1539 cpu_single_env = env;
1541 kvm_main_loop_cpu(env);
1542 return NULL;
1545 int kvm_init_vcpu(CPUState *env)
1547 pthread_create(&env->kvm_cpu_state.thread, NULL, ap_main_loop, env);
1549 while (env->created == 0)
1550 qemu_cond_wait(&qemu_vcpu_cond);
1552 return 0;
1555 int kvm_vcpu_inited(CPUState *env)
1557 return env->created;
1560 #ifdef TARGET_I386
1561 void kvm_hpet_disable_kpit(void)
1563 struct kvm_pit_state2 ps2;
1565 kvm_get_pit2(kvm_context, &ps2);
1566 ps2.flags |= KVM_PIT_FLAGS_HPET_LEGACY;
1567 kvm_set_pit2(kvm_context, &ps2);
1570 void kvm_hpet_enable_kpit(void)
1572 struct kvm_pit_state2 ps2;
1574 kvm_get_pit2(kvm_context, &ps2);
1575 ps2.flags &= ~KVM_PIT_FLAGS_HPET_LEGACY;
1576 kvm_set_pit2(kvm_context, &ps2);
1578 #endif
1580 int kvm_init_ap(void)
1582 struct sigaction action;
1584 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
1586 signal(SIG_IPI, sig_ipi_handler);
1588 memset(&action, 0, sizeof(action));
1589 action.sa_flags = SA_SIGINFO;
1590 action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
1591 sigaction(SIGBUS, &action, NULL);
1592 prctl(PR_MCE_KILL, 1, 1, 0, 0);
1593 return 0;
1596 void qemu_kvm_notify_work(void)
1598 /* Write 8 bytes to be compatible with eventfd. */
1599 static uint64_t val = 1;
1600 ssize_t ret;
1602 if (io_thread_fd == -1)
1603 return;
1605 do {
1606 ret = write(io_thread_fd, &val, sizeof(val));
1607 } while (ret < 0 && errno == EINTR);
1609 /* EAGAIN is fine in case we have a pipe. */
1610 if (ret < 0 && errno != EAGAIN) {
1611 fprintf(stderr, "qemu_kvm_notify_work: write() filed: %s\n",
1612 strerror(errno));
1613 exit (1);
1617 /* If we have signalfd, we mask out the signals we want to handle and then
1618 * use signalfd to listen for them. We rely on whatever the current signal
1619 * handler is to dispatch the signals when we receive them.
1622 static void sigfd_handler(void *opaque)
1624 int fd = (unsigned long) opaque;
1625 struct qemu_signalfd_siginfo info;
1626 struct sigaction action;
1627 ssize_t len;
1629 while (1) {
1630 do {
1631 len = read(fd, &info, sizeof(info));
1632 } while (len == -1 && errno == EINTR);
1634 if (len == -1 && errno == EAGAIN)
1635 break;
1637 if (len != sizeof(info)) {
1638 printf("read from sigfd returned %zd: %m\n", len);
1639 return;
1642 sigaction(info.ssi_signo, NULL, &action);
1643 if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction)
1644 action.sa_sigaction(info.ssi_signo,
1645 (siginfo_t *)&info, NULL);
1646 else if (action.sa_handler)
1647 action.sa_handler(info.ssi_signo);
/*
 * Used to break the IO thread out of select: drains the notification
 * descriptor passed in opaque.
 */
static void io_thread_wakeup(void *opaque)
{
    int fd = (unsigned long) opaque;
    ssize_t nread;
    char scratch[512];

    /* Drain the notify pipe.  For eventfd, only 8 bytes will be read. */
    for (;;) {
        nread = read(fd, scratch, sizeof(scratch));

        if (nread == -1 && errno == EINTR) {
            continue;           /* interrupted: retry */
        }
        if (nread == sizeof(scratch)) {
            continue;           /* buffer filled: more may be queued */
        }
        break;
    }
}
1665 int kvm_main_loop(void)
1667 int fds[2];
1668 sigset_t mask;
1669 int sigfd;
1671 io_thread = pthread_self();
1672 qemu_system_ready = 1;
1674 if (qemu_eventfd(fds) == -1) {
1675 fprintf(stderr, "failed to create eventfd\n");
1676 return -errno;
1679 fcntl(fds[0], F_SETFL, O_NONBLOCK);
1680 fcntl(fds[1], F_SETFL, O_NONBLOCK);
1682 qemu_set_fd_handler2(fds[0], NULL, io_thread_wakeup, NULL,
1683 (void *)(unsigned long) fds[0]);
1685 io_thread_fd = fds[1];
1687 sigemptyset(&mask);
1688 sigaddset(&mask, SIGIO);
1689 sigaddset(&mask, SIGALRM);
1690 sigaddset(&mask, SIGBUS);
1691 sigprocmask(SIG_BLOCK, &mask, NULL);
1693 sigfd = qemu_signalfd(&mask);
1694 if (sigfd == -1) {
1695 fprintf(stderr, "failed to create signalfd\n");
1696 return -errno;
1699 fcntl(sigfd, F_SETFL, O_NONBLOCK);
1701 qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
1702 (void *)(unsigned long) sigfd);
1704 pthread_cond_broadcast(&qemu_system_cond);
1706 io_thread_sigfd = sigfd;
1707 cpu_single_env = NULL;
1709 while (1) {
1710 main_loop_wait(0);
1711 if (qemu_shutdown_requested()) {
1712 monitor_protocol_event(QEVENT_SHUTDOWN, NULL);
1713 if (qemu_no_shutdown()) {
1714 vm_stop(0);
1715 } else
1716 break;
1717 } else if (qemu_powerdown_requested()) {
1718 monitor_protocol_event(QEVENT_POWERDOWN, NULL);
1719 qemu_irq_raise(qemu_system_powerdown);
1720 } else if (qemu_reset_requested()) {
1721 qemu_kvm_system_reset();
1722 } else if (kvm_debug_cpu_requested) {
1723 gdb_set_stop_cpu(kvm_debug_cpu_requested);
1724 vm_stop(EXCP_DEBUG);
1725 kvm_debug_cpu_requested = NULL;
1729 pause_all_threads();
1730 pthread_mutex_unlock(&qemu_mutex);
1732 return 0;
1735 #if !defined(TARGET_I386)
/* Fallback for non-i386 targets: does nothing and reports success (0). */
int kvm_arch_init_irq_routing(void)
{
    const int ok = 0;

    return ok;
}
1740 #endif
1742 extern int no_hpet;
1744 static int kvm_create_context(void)
1746 static const char upgrade_note[] =
1747 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
1748 "(see http://sourceforge.net/projects/kvm).\n";
1750 int r;
1752 if (!kvm_irqchip) {
1753 kvm_disable_irqchip_creation(kvm_context);
1755 if (!kvm_pit) {
1756 kvm_disable_pit_creation(kvm_context);
1758 if (kvm_create(kvm_context, 0, NULL) < 0) {
1759 kvm_finalize(kvm_state);
1760 return -1;
1762 r = kvm_arch_qemu_create_context();
1763 if (r < 0) {
1764 kvm_finalize(kvm_state);
1765 return -1;
1767 if (kvm_pit && !kvm_pit_reinject) {
1768 if (kvm_reinject_control(kvm_context, 0)) {
1769 fprintf(stderr, "failure to disable in-kernel PIT reinjection\n");
1770 return -1;
1774 /* There was a nasty bug in < kvm-80 that prevents memory slots from being
1775 * destroyed properly. Since we rely on this capability, refuse to work
1776 * with any kernel without this capability. */
1777 if (!kvm_check_extension(kvm_state, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
1778 fprintf(stderr,
1779 "KVM kernel module broken (DESTROY_MEMORY_REGION).\n%s",
1780 upgrade_note);
1781 return -EINVAL;
1784 r = kvm_arch_init_irq_routing();
1785 if (r < 0) {
1786 return r;
1789 kvm_state->vcpu_events = 0;
1790 #ifdef KVM_CAP_VCPU_EVENTS
1791 kvm_state->vcpu_events = kvm_check_extension(kvm_state, KVM_CAP_VCPU_EVENTS);
1792 #endif
1794 kvm_state->debugregs = 0;
1795 #ifdef KVM_CAP_DEBUGREGS
1796 kvm_state->debugregs = kvm_check_extension(kvm_state, KVM_CAP_DEBUGREGS);
1797 #endif
1799 kvm_init_ap();
1800 if (kvm_irqchip) {
1801 if (!qemu_kvm_has_gsi_routing()) {
1802 irq0override = 0;
1803 #ifdef TARGET_I386
1804 /* if kernel can't do irq routing, interrupt source
1805 * override 0->2 can not be set up as required by hpet,
1806 * so disable hpet.
1808 no_hpet = 1;
1809 } else if (!qemu_kvm_has_pit_state2()) {
1810 no_hpet = 1;
1812 #else
1814 #endif
1817 return 0;
1820 #ifdef KVM_CAP_IRQCHIP
1822 int kvm_set_irq(int irq, int level, int *status)
1824 return kvm_set_irq_level(kvm_context, irq, level, status);
1827 #endif
1829 void kvm_mutex_unlock(void)
1831 assert(!cpu_single_env);
1832 pthread_mutex_unlock(&qemu_mutex);
1835 void kvm_mutex_lock(void)
1837 pthread_mutex_lock(&qemu_mutex);
1838 cpu_single_env = NULL;
/* Release the global mutex via kvm_mutex_unlock(); no-op without KVM. */
void qemu_mutex_unlock_iothread(void)
{
    if (!kvm_enabled()) {
        return;
    }

    kvm_mutex_unlock();
}
/* Take the global mutex via kvm_mutex_lock(); no-op without KVM. */
void qemu_mutex_lock_iothread(void)
{
    if (!kvm_enabled()) {
        return;
    }

    kvm_mutex_lock();
}
1853 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1854 void kvm_add_ioperm_data(struct ioperm_data *data)
1856 QLIST_INSERT_HEAD(&ioperm_head, data, entries);
1859 void kvm_remove_ioperm_data(unsigned long start_port, unsigned long num)
1861 struct ioperm_data *data;
1863 data = QLIST_FIRST(&ioperm_head);
1864 while (data) {
1865 struct ioperm_data *next = QLIST_NEXT(data, entries);
1867 if (data->start_port == start_port && data->num == num) {
1868 QLIST_REMOVE(data, entries);
1869 qemu_free(data);
1872 data = next;
1876 void kvm_ioperm(CPUState *env, void *data)
1878 if (kvm_enabled() && qemu_system_ready)
1879 on_vcpu(env, kvm_arch_do_ioperm, data);
1882 #endif
1884 int kvm_set_boot_cpu_id(uint32_t id)
1886 return kvm_set_boot_vcpu_id(kvm_context, id);
1889 #ifdef TARGET_I386
1890 #ifdef KVM_CAP_MCE
1891 struct kvm_x86_mce_data {
1892 CPUState *env;
1893 struct kvm_x86_mce *mce;
1894 int abort_on_error;
1897 static void kvm_do_inject_x86_mce(void *_data)
1899 struct kvm_x86_mce_data *data = _data;
1900 int r;
1902 /* If there is an MCE excpetion being processed, ignore this SRAO MCE */
1903 r = kvm_mce_in_exception(data->env);
1904 if (r == -1)
1905 fprintf(stderr, "Failed to get MCE status\n");
1906 else if (r && !(data->mce->status & MCI_STATUS_AR))
1907 return;
1908 r = kvm_set_mce(data->env, data->mce);
1909 if (r < 0) {
1910 perror("kvm_set_mce FAILED");
1911 if (data->abort_on_error)
1912 abort();
1915 #endif
1917 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
1918 uint64_t mcg_status, uint64_t addr, uint64_t misc,
1919 int abort_on_error)
1921 #ifdef KVM_CAP_MCE
1922 struct kvm_x86_mce mce = {
1923 .bank = bank,
1924 .status = status,
1925 .mcg_status = mcg_status,
1926 .addr = addr,
1927 .misc = misc,
1929 struct kvm_x86_mce_data data = {
1930 .env = cenv,
1931 .mce = &mce,
1932 .abort_on_error = abort_on_error,
1935 if (!cenv->mcg_cap) {
1936 fprintf(stderr, "MCE support is not enabled!\n");
1937 return;
1939 on_vcpu(cenv, kvm_do_inject_x86_mce, &data);
1940 #else
1941 if (abort_on_error)
1942 abort();
1943 #endif
1945 #endif