Merge commit '88c8e03f5ddca8c8448dbb99d9de07d87b03ddd6' into upstream-merge
[qemu-kvm/amd-iommu.git] / qemu-kvm.c
blob35a4c8af17d0df08e2c869864db4b213c1b6c7ab
1 /*
2 * qemu/kvm integration
4 * Copyright (C) 2006-2008 Qumranet Technologies
6 * Licensed under the terms of the GNU GPL version 2 or higher.
7 */
8 #include "config.h"
9 #include "config-host.h"
11 #include <assert.h>
12 #include <string.h>
13 #include "hw/hw.h"
14 #include "sysemu.h"
15 #include "qemu-common.h"
16 #include "console.h"
17 #include "block.h"
18 #include "compatfd.h"
19 #include "gdbstub.h"
20 #include "monitor.h"
22 #include "qemu-kvm.h"
23 #include "libkvm.h"
25 #include <pthread.h>
26 #include <sys/utsname.h>
27 #include <sys/syscall.h>
28 #include <sys/mman.h>
29 #include <sys/ioctl.h>
30 #include "compatfd.h"
31 #include <sys/prctl.h>
/* Local boolean constants (used for the work-item "done" flag below). */
33 #define false 0
34 #define true 1
/* Fallback values for constants that the host headers may not define. */
36 #ifndef PR_MCE_KILL
37 #define PR_MCE_KILL 33
38 #endif
/* SIGBUS si_code values tested by the SIGBUS handlers in this file. */
40 #ifndef BUS_MCEERR_AR
41 #define BUS_MCEERR_AR 4
42 #endif
43 #ifndef BUS_MCEERR_AO
44 #define BUS_MCEERR_AO 5
45 #endif
/*
 * KVM API version this file is written against; checked at build time
 * against KVM_API_VERSION and at run time in kvm_init().
 */
47 #define EXPECTED_KVM_API_VERSION 12
49 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
50 #error libkvm: userspace and kernel version mismatch
51 #endif
/* Feature switches; NOTE(review): consumers are outside this chunk. */
53 int kvm_irqchip = 1;
54 int kvm_pit = 1;
55 int kvm_pit_reinject = 1;
56 int kvm_nested = 0;
/* Global KVM state; kvm_context points into kvm_state (set in kvm_init()). */
59 KVMState *kvm_state;
60 kvm_context_t kvm_context;
/*
 * qemu_mutex is taken in kvm_init(), dropped in pre_kvm_run() and re-taken
 * in post_kvm_run(); the condition variables below are waited on with it.
 */
62 pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
63 pthread_cond_t qemu_vcpu_cond = PTHREAD_COND_INITIALIZER;
64 pthread_cond_t qemu_system_cond = PTHREAD_COND_INITIALIZER;
65 pthread_cond_t qemu_pause_cond = PTHREAD_COND_INITIALIZER;
66 pthread_cond_t qemu_work_cond = PTHREAD_COND_INITIALIZER;
/* Per-thread pointer to the CPUState served by the calling vcpu thread. */
67 __thread CPUState *current_env;
69 static int qemu_system_ready;
/* Signal sent with pthread_kill() to vcpu threads to get their attention. */
71 #define SIG_IPI (SIGRTMIN+4)
73 pthread_t io_thread;
74 static int io_thread_fd = -1;
75 static int io_thread_sigfd = -1;
/* Set by kvm_debug() to the vcpu that reported a handled debug exit. */
77 static CPUState *kvm_debug_cpu_requested;
79 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
80 /* The list of ioperm_data */
81 static QLIST_HEAD(, ioperm_data) ioperm_head;
82 #endif
/* Round x up to a multiple of y; the mask trick requires y to be a power of two. */
84 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
86 int kvm_abi = EXPECTED_KVM_API_VERSION;
87 int kvm_page_size;
89 #ifdef KVM_CAP_SET_GUEST_DEBUG
90 static int kvm_debug(CPUState *env,
91 struct kvm_debug_exit_arch *arch_info)
93 int handle = kvm_arch_debug(arch_info);
95 if (handle) {
96 kvm_debug_cpu_requested = env;
97 env->stopped = 1;
99 return handle;
101 #endif
/*
 * Report an exit reason nobody handles.
 *
 * Prints @reason in hexadecimal on stderr and always returns -EINVAL.
 */
static int handle_unhandled(uint64_t reason)
{
    const int rc = -EINVAL;

    fprintf(stderr, "kvm: unhandled exit %" PRIx64 "\n", reason);
    return rc;
}
110 static inline void set_gsi(kvm_context_t kvm, unsigned int gsi)
112 uint32_t *bitmap = kvm->used_gsi_bitmap;
114 if (gsi < kvm->max_gsi)
115 bitmap[gsi / 32] |= 1U << (gsi % 32);
116 else
117 DPRINTF("Invalid GSI %u\n", gsi);
120 static inline void clear_gsi(kvm_context_t kvm, unsigned int gsi)
122 uint32_t *bitmap = kvm->used_gsi_bitmap;
124 if (gsi < kvm->max_gsi)
125 bitmap[gsi / 32] &= ~(1U << (gsi % 32));
126 else
127 DPRINTF("Invalid GSI %u\n", gsi);
130 static int kvm_create_context(void);
132 int kvm_init(int smp_cpus)
134 int fd;
135 int r, gsi_count;
138 fd = open("/dev/kvm", O_RDWR);
139 if (fd == -1) {
140 perror("open /dev/kvm");
141 return -1;
143 r = ioctl(fd, KVM_GET_API_VERSION, 0);
144 if (r == -1) {
145 fprintf(stderr,
146 "kvm kernel version too old: "
147 "KVM_GET_API_VERSION ioctl not supported\n");
148 goto out_close;
150 if (r < EXPECTED_KVM_API_VERSION) {
151 fprintf(stderr, "kvm kernel version too old: "
152 "We expect API version %d or newer, but got "
153 "version %d\n", EXPECTED_KVM_API_VERSION, r);
154 goto out_close;
156 if (r > EXPECTED_KVM_API_VERSION) {
157 fprintf(stderr, "kvm userspace version too old\n");
158 goto out_close;
160 kvm_abi = r;
161 kvm_page_size = getpagesize();
162 kvm_state = qemu_mallocz(sizeof(*kvm_state));
163 kvm_context = &kvm_state->kvm_context;
165 kvm_state->fd = fd;
166 kvm_state->vmfd = -1;
167 kvm_context->opaque = cpu_single_env;
168 kvm_context->dirty_pages_log_all = 0;
169 kvm_context->no_irqchip_creation = 0;
170 kvm_context->no_pit_creation = 0;
172 #ifdef KVM_CAP_SET_GUEST_DEBUG
173 QTAILQ_INIT(&kvm_state->kvm_sw_breakpoints);
174 #endif
176 gsi_count = kvm_get_gsi_count(kvm_context);
177 if (gsi_count > 0) {
178 int gsi_bits, i;
180 /* Round up so we can search ints using ffs */
181 gsi_bits = ALIGN(gsi_count, 32);
182 kvm_context->used_gsi_bitmap = qemu_mallocz(gsi_bits / 8);
183 kvm_context->max_gsi = gsi_bits;
185 /* Mark any over-allocated bits as already in use */
186 for (i = gsi_count; i < gsi_bits; i++)
187 set_gsi(kvm_context, i);
190 kvm_cpu_register_phys_memory_client();
192 pthread_mutex_lock(&qemu_mutex);
193 return kvm_create_context();
195 out_close:
196 close(fd);
197 return -1;
200 static void kvm_finalize(KVMState *s)
202 /* FIXME
203 if (kvm->vcpu_fd[0] != -1)
204 close(kvm->vcpu_fd[0]);
205 if (kvm->vm_fd != -1)
206 close(kvm->vm_fd);
208 close(s->fd);
209 free(s);
212 void kvm_disable_irqchip_creation(kvm_context_t kvm)
214 kvm->no_irqchip_creation = 1;
217 void kvm_disable_pit_creation(kvm_context_t kvm)
219 kvm->no_pit_creation = 1;
222 static void kvm_reset_vcpu(void *opaque)
224 CPUState *env = opaque;
226 kvm_arch_cpu_reset(env);
229 static void kvm_create_vcpu(CPUState *env, int id)
231 long mmap_size;
232 int r;
233 KVMState *s = kvm_state;
235 r = kvm_vm_ioctl(kvm_state, KVM_CREATE_VCPU, id);
236 if (r < 0) {
237 fprintf(stderr, "kvm_create_vcpu: %m\n");
238 fprintf(stderr, "Failed to create vCPU. Check the -smp parameter.\n");
239 goto err;
242 env->kvm_fd = r;
243 env->kvm_state = kvm_state;
245 mmap_size = kvm_ioctl(kvm_state, KVM_GET_VCPU_MMAP_SIZE, 0);
246 if (mmap_size < 0) {
247 fprintf(stderr, "get vcpu mmap size: %m\n");
248 goto err_fd;
250 env->kvm_run =
251 mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, env->kvm_fd,
253 if (env->kvm_run == MAP_FAILED) {
254 fprintf(stderr, "mmap vcpu area: %m\n");
255 goto err_fd;
258 #ifdef KVM_CAP_COALESCED_MMIO
259 if (s->coalesced_mmio && !s->coalesced_mmio_ring)
260 s->coalesced_mmio_ring = (void *) env->kvm_run +
261 s->coalesced_mmio * PAGE_SIZE;
262 #endif
264 r = kvm_arch_init_vcpu(env);
265 if (r == 0) {
266 qemu_register_reset(kvm_reset_vcpu, env);
269 return;
270 err_fd:
271 close(env->kvm_fd);
272 err:
273 /* We're no good with semi-broken states. */
274 abort();
277 static int kvm_set_boot_vcpu_id(kvm_context_t kvm, uint32_t id)
279 #ifdef KVM_CAP_SET_BOOT_CPU_ID
280 int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_SET_BOOT_CPU_ID);
281 if (r > 0)
282 return kvm_vm_ioctl(kvm_state, KVM_SET_BOOT_CPU_ID, id);
283 return -ENOSYS;
284 #else
285 return -ENOSYS;
286 #endif
289 int kvm_create_vm(kvm_context_t kvm)
291 int fd;
292 #ifdef KVM_CAP_IRQ_ROUTING
293 kvm->irq_routes = qemu_mallocz(sizeof(*kvm->irq_routes));
294 kvm->nr_allocated_irq_routes = 0;
295 #endif
297 fd = kvm_ioctl(kvm_state, KVM_CREATE_VM, 0);
298 if (fd < 0) {
299 fprintf(stderr, "kvm_create_vm: %m\n");
300 return -1;
302 kvm_state->vmfd = fd;
303 return 0;
306 static int kvm_create_default_phys_mem(kvm_context_t kvm,
307 unsigned long phys_mem_bytes,
308 void **vm_mem)
310 #ifdef KVM_CAP_USER_MEMORY
311 int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY);
312 if (r > 0)
313 return 0;
314 fprintf(stderr,
315 "Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported\n");
316 #else
317 #error Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported
318 #endif
319 return -1;
322 void kvm_create_irqchip(kvm_context_t kvm)
324 int r;
326 kvm->irqchip_in_kernel = 0;
327 #ifdef KVM_CAP_IRQCHIP
328 if (!kvm->no_irqchip_creation) {
329 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP);
330 if (r > 0) { /* kernel irqchip supported */
331 r = kvm_vm_ioctl(kvm_state, KVM_CREATE_IRQCHIP);
332 if (r >= 0) {
333 kvm->irqchip_inject_ioctl = KVM_IRQ_LINE;
334 #if defined(KVM_CAP_IRQ_INJECT_STATUS) && defined(KVM_IRQ_LINE_STATUS)
335 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION,
336 KVM_CAP_IRQ_INJECT_STATUS);
337 if (r > 0)
338 kvm->irqchip_inject_ioctl = KVM_IRQ_LINE_STATUS;
339 #endif
340 kvm->irqchip_in_kernel = 1;
341 } else
342 fprintf(stderr, "Create kernel PIC irqchip failed\n");
345 #endif
346 kvm_state->irqchip_in_kernel = kvm->irqchip_in_kernel;
349 int kvm_create(kvm_context_t kvm, unsigned long phys_mem_bytes, void **vm_mem)
351 int r, i;
353 r = kvm_create_vm(kvm);
354 if (r < 0)
355 return r;
356 r = kvm_arch_create(kvm, phys_mem_bytes, vm_mem);
357 if (r < 0)
358 return r;
359 for (i = 0; i < ARRAY_SIZE(kvm_state->slots); i++)
360 kvm_state->slots[i].slot = i;
362 r = kvm_create_default_phys_mem(kvm, phys_mem_bytes, vm_mem);
363 if (r < 0)
364 return r;
365 kvm_create_irqchip(kvm);
367 return 0;
370 #ifdef KVM_CAP_IRQCHIP
372 int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int *status)
374 struct kvm_irq_level event;
375 int r;
377 if (!kvm->irqchip_in_kernel)
378 return 0;
379 event.level = level;
380 event.irq = irq;
381 r = kvm_vm_ioctl(kvm_state, kvm->irqchip_inject_ioctl, &event);
382 if (r < 0)
383 perror("kvm_set_irq_level");
385 if (status) {
386 #ifdef KVM_CAP_IRQ_INJECT_STATUS
387 *status =
388 (kvm->irqchip_inject_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
389 #else
390 *status = 1;
391 #endif
394 return 1;
397 int kvm_get_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
399 int r;
401 if (!kvm->irqchip_in_kernel)
402 return 0;
403 r = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip);
404 if (r < 0) {
405 perror("kvm_get_irqchip\n");
407 return r;
410 int kvm_set_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
412 int r;
414 if (!kvm->irqchip_in_kernel)
415 return 0;
416 r = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip);
417 if (r < 0) {
418 perror("kvm_set_irqchip\n");
420 return r;
423 #endif
425 static int handle_debug(CPUState *env)
427 #ifdef KVM_CAP_SET_GUEST_DEBUG
428 struct kvm_run *run = env->kvm_run;
430 return kvm_debug(env, &run->debug.arch);
431 #else
432 return 0;
433 #endif
436 int kvm_get_regs(CPUState *env, struct kvm_regs *regs)
438 return kvm_vcpu_ioctl(env, KVM_GET_REGS, regs);
441 int kvm_set_regs(CPUState *env, struct kvm_regs *regs)
443 return kvm_vcpu_ioctl(env, KVM_SET_REGS, regs);
446 int kvm_get_fpu(CPUState *env, struct kvm_fpu *fpu)
448 return kvm_vcpu_ioctl(env, KVM_GET_FPU, fpu);
451 int kvm_set_fpu(CPUState *env, struct kvm_fpu *fpu)
453 return kvm_vcpu_ioctl(env, KVM_SET_FPU, fpu);
456 int kvm_get_sregs(CPUState *env, struct kvm_sregs *sregs)
458 return kvm_vcpu_ioctl(env, KVM_GET_SREGS, sregs);
461 int kvm_set_sregs(CPUState *env, struct kvm_sregs *sregs)
463 return kvm_vcpu_ioctl(env, KVM_SET_SREGS, sregs);
466 #ifdef KVM_CAP_MP_STATE
467 int kvm_get_mpstate(CPUState *env, struct kvm_mp_state *mp_state)
469 int r;
471 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE);
472 if (r > 0)
473 return kvm_vcpu_ioctl(env, KVM_GET_MP_STATE, mp_state);
474 return -ENOSYS;
477 int kvm_set_mpstate(CPUState *env, struct kvm_mp_state *mp_state)
479 int r;
481 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE);
482 if (r > 0)
483 return kvm_vcpu_ioctl(env, KVM_SET_MP_STATE, mp_state);
484 return -ENOSYS;
486 #endif
488 static int handle_mmio(CPUState *env)
490 unsigned long addr = env->kvm_run->mmio.phys_addr;
491 struct kvm_run *kvm_run = env->kvm_run;
492 void *data = kvm_run->mmio.data;
494 /* hack: Red Hat 7.1 generates these weird accesses. */
495 if ((addr > 0xa0000 - 4 && addr <= 0xa0000) && kvm_run->mmio.len == 3)
496 return 0;
498 cpu_physical_memory_rw(addr, data, kvm_run->mmio.len, kvm_run->mmio.is_write);
499 return 0;
502 int handle_io_window(kvm_context_t kvm)
504 return 1;
507 int handle_shutdown(kvm_context_t kvm, CPUState *env)
509 /* stop the current vcpu from going back to guest mode */
510 env->stopped = 1;
512 qemu_system_reset_request();
513 return 1;
516 static inline void push_nmi(kvm_context_t kvm)
518 #ifdef KVM_CAP_USER_NMI
519 kvm_arch_push_nmi(kvm->opaque);
520 #endif /* KVM_CAP_USER_NMI */
523 void post_kvm_run(kvm_context_t kvm, CPUState *env)
525 pthread_mutex_lock(&qemu_mutex);
526 kvm_arch_post_run(env, env->kvm_run);
527 cpu_single_env = env;
530 int pre_kvm_run(kvm_context_t kvm, CPUState *env)
532 kvm_arch_pre_run(env, env->kvm_run);
534 pthread_mutex_unlock(&qemu_mutex);
535 return 0;
538 int kvm_is_ready_for_interrupt_injection(CPUState *env)
540 return env->kvm_run->ready_for_interrupt_injection;
543 int kvm_run(CPUState *env)
545 int r;
546 kvm_context_t kvm = &env->kvm_state->kvm_context;
547 struct kvm_run *run = env->kvm_run;
548 int fd = env->kvm_fd;
550 again:
551 if (env->kvm_vcpu_dirty) {
552 kvm_arch_load_regs(env, KVM_PUT_RUNTIME_STATE);
553 env->kvm_vcpu_dirty = 0;
555 push_nmi(kvm);
556 #if !defined(__s390__)
557 if (!kvm->irqchip_in_kernel)
558 run->request_interrupt_window = kvm_arch_try_push_interrupts(env);
559 #endif
561 r = pre_kvm_run(kvm, env);
562 if (r)
563 return r;
564 r = ioctl(fd, KVM_RUN, 0);
566 if (r == -1 && errno != EINTR && errno != EAGAIN) {
567 r = -errno;
568 post_kvm_run(kvm, env);
569 fprintf(stderr, "kvm_run: %s\n", strerror(-r));
570 return r;
573 post_kvm_run(kvm, env);
575 kvm_flush_coalesced_mmio_buffer();
577 #if !defined(__s390__)
578 if (r == -1) {
579 r = handle_io_window(kvm);
580 goto more;
582 #endif
583 if (1) {
584 switch (run->exit_reason) {
585 case KVM_EXIT_UNKNOWN:
586 r = handle_unhandled(run->hw.hardware_exit_reason);
587 break;
588 case KVM_EXIT_FAIL_ENTRY:
589 r = handle_unhandled(run->fail_entry.hardware_entry_failure_reason);
590 break;
591 case KVM_EXIT_EXCEPTION:
592 fprintf(stderr, "exception %d (%x)\n", run->ex.exception,
593 run->ex.error_code);
594 kvm_show_regs(env);
595 kvm_show_code(env);
596 abort();
597 break;
598 case KVM_EXIT_IO:
599 r = kvm_handle_io(run->io.port,
600 (uint8_t *)run + run->io.data_offset,
601 run->io.direction,
602 run->io.size,
603 run->io.count);
604 r = 0;
605 break;
606 case KVM_EXIT_DEBUG:
607 r = handle_debug(env);
608 break;
609 case KVM_EXIT_MMIO:
610 r = handle_mmio(env);
611 break;
612 case KVM_EXIT_HLT:
613 r = kvm_arch_halt(env);
614 break;
615 case KVM_EXIT_IRQ_WINDOW_OPEN:
616 break;
617 case KVM_EXIT_SHUTDOWN:
618 r = handle_shutdown(kvm, env);
619 break;
620 #if defined(__s390__)
621 case KVM_EXIT_S390_SIEIC:
622 r = kvm_s390_handle_intercept(kvm, env, run);
623 break;
624 case KVM_EXIT_S390_RESET:
625 r = kvm_s390_handle_reset(kvm, env, run);
626 break;
627 #endif
628 case KVM_EXIT_INTERNAL_ERROR:
629 kvm_handle_internal_error(env, run);
630 r = 1;
631 break;
632 default:
633 if (kvm_arch_run(env)) {
634 fprintf(stderr, "unhandled vm exit: 0x%x\n", run->exit_reason);
635 kvm_show_regs(env);
636 abort();
638 break;
641 more:
642 if (!r)
643 goto again;
644 return r;
647 int kvm_inject_irq(CPUState *env, unsigned irq)
649 struct kvm_interrupt intr;
651 intr.irq = irq;
652 return kvm_vcpu_ioctl(env, KVM_INTERRUPT, &intr);
655 int kvm_inject_nmi(CPUState *env)
657 #ifdef KVM_CAP_USER_NMI
658 return kvm_vcpu_ioctl(env, KVM_NMI);
659 #else
660 return -ENOSYS;
661 #endif
664 int kvm_init_coalesced_mmio(kvm_context_t kvm)
666 int r = 0;
667 kvm_state->coalesced_mmio = 0;
668 #ifdef KVM_CAP_COALESCED_MMIO
669 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
670 if (r > 0) {
671 kvm_state->coalesced_mmio = r;
672 return 0;
674 #endif
675 return r;
678 #ifdef KVM_CAP_DEVICE_ASSIGNMENT
679 int kvm_assign_pci_device(kvm_context_t kvm,
680 struct kvm_assigned_pci_dev *assigned_dev)
682 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_PCI_DEVICE, assigned_dev);
685 static int kvm_old_assign_irq(kvm_context_t kvm,
686 struct kvm_assigned_irq *assigned_irq)
688 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_IRQ, assigned_irq);
691 #ifdef KVM_CAP_ASSIGN_DEV_IRQ
692 int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
694 int ret;
696 ret = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_ASSIGN_DEV_IRQ);
697 if (ret > 0) {
698 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_DEV_IRQ, assigned_irq);
701 return kvm_old_assign_irq(kvm, assigned_irq);
704 int kvm_deassign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
706 return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_DEV_IRQ, assigned_irq);
708 #else
709 int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
711 return kvm_old_assign_irq(kvm, assigned_irq);
713 #endif
714 #endif
716 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT
717 int kvm_deassign_pci_device(kvm_context_t kvm,
718 struct kvm_assigned_pci_dev *assigned_dev)
720 return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_PCI_DEVICE, assigned_dev);
722 #endif
724 int kvm_reinject_control(kvm_context_t kvm, int pit_reinject)
726 #ifdef KVM_CAP_REINJECT_CONTROL
727 int r;
728 struct kvm_reinject_control control;
730 control.pit_reinject = pit_reinject;
732 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_REINJECT_CONTROL);
733 if (r > 0) {
734 return kvm_vm_ioctl(kvm_state, KVM_REINJECT_CONTROL, &control);
736 #endif
737 return -ENOSYS;
740 int kvm_has_gsi_routing(kvm_context_t kvm)
742 int r = 0;
744 #ifdef KVM_CAP_IRQ_ROUTING
745 r = kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
746 #endif
747 return r;
750 int kvm_get_gsi_count(kvm_context_t kvm)
752 #ifdef KVM_CAP_IRQ_ROUTING
753 return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
754 #else
755 return -EINVAL;
756 #endif
759 int kvm_clear_gsi_routes(kvm_context_t kvm)
761 #ifdef KVM_CAP_IRQ_ROUTING
762 kvm->irq_routes->nr = 0;
763 return 0;
764 #else
765 return -EINVAL;
766 #endif
769 int kvm_add_routing_entry(kvm_context_t kvm,
770 struct kvm_irq_routing_entry *entry)
772 #ifdef KVM_CAP_IRQ_ROUTING
773 struct kvm_irq_routing *z;
774 struct kvm_irq_routing_entry *new;
775 int n, size;
777 if (kvm->irq_routes->nr == kvm->nr_allocated_irq_routes) {
778 n = kvm->nr_allocated_irq_routes * 2;
779 if (n < 64)
780 n = 64;
781 size = sizeof(struct kvm_irq_routing);
782 size += n * sizeof(*new);
783 z = realloc(kvm->irq_routes, size);
784 if (!z)
785 return -ENOMEM;
786 kvm->nr_allocated_irq_routes = n;
787 kvm->irq_routes = z;
789 n = kvm->irq_routes->nr++;
790 new = &kvm->irq_routes->entries[n];
791 memset(new, 0, sizeof(*new));
792 new->gsi = entry->gsi;
793 new->type = entry->type;
794 new->flags = entry->flags;
795 new->u = entry->u;
797 set_gsi(kvm, entry->gsi);
799 return 0;
800 #else
801 return -ENOSYS;
802 #endif
805 int kvm_add_irq_route(kvm_context_t kvm, int gsi, int irqchip, int pin)
807 #ifdef KVM_CAP_IRQ_ROUTING
808 struct kvm_irq_routing_entry e;
810 e.gsi = gsi;
811 e.type = KVM_IRQ_ROUTING_IRQCHIP;
812 e.flags = 0;
813 e.u.irqchip.irqchip = irqchip;
814 e.u.irqchip.pin = pin;
815 return kvm_add_routing_entry(kvm, &e);
816 #else
817 return -ENOSYS;
818 #endif
821 int kvm_del_routing_entry(kvm_context_t kvm,
822 struct kvm_irq_routing_entry *entry)
824 #ifdef KVM_CAP_IRQ_ROUTING
825 struct kvm_irq_routing_entry *e, *p;
826 int i, gsi, found = 0;
828 gsi = entry->gsi;
830 for (i = 0; i < kvm->irq_routes->nr; ++i) {
831 e = &kvm->irq_routes->entries[i];
832 if (e->type == entry->type && e->gsi == gsi) {
833 switch (e->type) {
834 case KVM_IRQ_ROUTING_IRQCHIP:{
835 if (e->u.irqchip.irqchip ==
836 entry->u.irqchip.irqchip
837 && e->u.irqchip.pin == entry->u.irqchip.pin) {
838 p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
839 *e = *p;
840 found = 1;
842 break;
844 case KVM_IRQ_ROUTING_MSI:{
845 if (e->u.msi.address_lo ==
846 entry->u.msi.address_lo
847 && e->u.msi.address_hi ==
848 entry->u.msi.address_hi
849 && e->u.msi.data == entry->u.msi.data) {
850 p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
851 *e = *p;
852 found = 1;
854 break;
856 default:
857 break;
859 if (found) {
860 /* If there are no other users of this GSI
861 * mark it available in the bitmap */
862 for (i = 0; i < kvm->irq_routes->nr; i++) {
863 e = &kvm->irq_routes->entries[i];
864 if (e->gsi == gsi)
865 break;
867 if (i == kvm->irq_routes->nr)
868 clear_gsi(kvm, gsi);
870 return 0;
874 return -ESRCH;
875 #else
876 return -ENOSYS;
877 #endif
880 int kvm_update_routing_entry(kvm_context_t kvm,
881 struct kvm_irq_routing_entry *entry,
882 struct kvm_irq_routing_entry *newentry)
884 #ifdef KVM_CAP_IRQ_ROUTING
885 struct kvm_irq_routing_entry *e;
886 int i;
888 if (entry->gsi != newentry->gsi || entry->type != newentry->type) {
889 return -EINVAL;
892 for (i = 0; i < kvm->irq_routes->nr; ++i) {
893 e = &kvm->irq_routes->entries[i];
894 if (e->type != entry->type || e->gsi != entry->gsi) {
895 continue;
897 switch (e->type) {
898 case KVM_IRQ_ROUTING_IRQCHIP:
899 if (e->u.irqchip.irqchip == entry->u.irqchip.irqchip &&
900 e->u.irqchip.pin == entry->u.irqchip.pin) {
901 memcpy(&e->u.irqchip, &newentry->u.irqchip,
902 sizeof e->u.irqchip);
903 return 0;
905 break;
906 case KVM_IRQ_ROUTING_MSI:
907 if (e->u.msi.address_lo == entry->u.msi.address_lo &&
908 e->u.msi.address_hi == entry->u.msi.address_hi &&
909 e->u.msi.data == entry->u.msi.data) {
910 memcpy(&e->u.msi, &newentry->u.msi, sizeof e->u.msi);
911 return 0;
913 break;
914 default:
915 break;
918 return -ESRCH;
919 #else
920 return -ENOSYS;
921 #endif
924 int kvm_del_irq_route(kvm_context_t kvm, int gsi, int irqchip, int pin)
926 #ifdef KVM_CAP_IRQ_ROUTING
927 struct kvm_irq_routing_entry e;
929 e.gsi = gsi;
930 e.type = KVM_IRQ_ROUTING_IRQCHIP;
931 e.flags = 0;
932 e.u.irqchip.irqchip = irqchip;
933 e.u.irqchip.pin = pin;
934 return kvm_del_routing_entry(kvm, &e);
935 #else
936 return -ENOSYS;
937 #endif
940 int kvm_commit_irq_routes(kvm_context_t kvm)
942 #ifdef KVM_CAP_IRQ_ROUTING
943 kvm->irq_routes->flags = 0;
944 return kvm_vm_ioctl(kvm_state, KVM_SET_GSI_ROUTING, kvm->irq_routes);
945 #else
946 return -ENOSYS;
947 #endif
950 int kvm_get_irq_route_gsi(kvm_context_t kvm)
952 int i, bit;
953 uint32_t *buf = kvm->used_gsi_bitmap;
955 /* Return the lowest unused GSI in the bitmap */
956 for (i = 0; i < kvm->max_gsi / 32; i++) {
957 bit = ffs(~buf[i]);
958 if (!bit)
959 continue;
961 return bit - 1 + i * 32;
964 return -ENOSPC;
967 #ifdef KVM_CAP_DEVICE_MSIX
968 int kvm_assign_set_msix_nr(kvm_context_t kvm,
969 struct kvm_assigned_msix_nr *msix_nr)
971 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_NR, msix_nr);
974 int kvm_assign_set_msix_entry(kvm_context_t kvm,
975 struct kvm_assigned_msix_entry *entry)
977 return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_ENTRY, entry);
979 #endif
981 #if defined(KVM_CAP_IRQFD) && defined(CONFIG_EVENTFD)
983 #include <sys/eventfd.h>
985 static int _kvm_irqfd(kvm_context_t kvm, int fd, int gsi, int flags)
987 struct kvm_irqfd data = {
988 .fd = fd,
989 .gsi = gsi,
990 .flags = flags,
993 return kvm_vm_ioctl(kvm_state, KVM_IRQFD, &data);
996 int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
998 int r;
999 int fd;
1001 if (!kvm_check_extension(kvm_state, KVM_CAP_IRQFD))
1002 return -ENOENT;
1004 fd = eventfd(0, 0);
1005 if (fd < 0)
1006 return -errno;
1008 r = _kvm_irqfd(kvm, fd, gsi, 0);
1009 if (r < 0) {
1010 close(fd);
1011 return -errno;
1014 return fd;
1017 #else /* KVM_CAP_IRQFD */
1019 int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
1021 return -ENOSYS;
1024 #endif /* KVM_CAP_IRQFD */
/* Return the calling thread's id as reported by the gettid system call. */
unsigned long kvm_get_thread_id(void)
{
    unsigned long tid = syscall(SYS_gettid);

    return tid;
}
1030 static void qemu_cond_wait(pthread_cond_t *cond)
1032 CPUState *env = cpu_single_env;
1034 pthread_cond_wait(cond, &qemu_mutex);
1035 cpu_single_env = env;
/* SIG_IPI handler: intentionally does nothing. */
static void sig_ipi_handler(int n)
{
    (void) n;
}
/* Report a hardware memory error on stderr and terminate with exit status 1. */
static void hardware_memory_error(void)
{
    fprintf(stderr, "Hardware memory error!\n");

    exit(1);
}
1048 static void sigbus_reraise(void)
1050 sigset_t set;
1051 struct sigaction action;
1053 memset(&action, 0, sizeof(action));
1054 action.sa_handler = SIG_DFL;
1055 if (!sigaction(SIGBUS, &action, NULL)) {
1056 raise(SIGBUS);
1057 sigemptyset(&set);
1058 sigaddset(&set, SIGBUS);
1059 sigprocmask(SIG_UNBLOCK, &set, NULL);
1061 perror("Failed to re-raise SIGBUS!\n");
1062 abort();
1065 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
1066 void *ctx)
1068 #if defined(KVM_CAP_MCE) && defined(TARGET_I386)
1069 if (first_cpu->mcg_cap && siginfo->ssi_addr
1070 && siginfo->ssi_code == BUS_MCEERR_AO) {
1071 uint64_t status;
1072 unsigned long paddr;
1073 CPUState *cenv;
1075 /* Hope we are lucky for AO MCE */
1076 if (do_qemu_ram_addr_from_host((void *)(intptr_t)siginfo->ssi_addr,
1077 &paddr)) {
1078 fprintf(stderr, "Hardware memory error for memory used by "
1079 "QEMU itself instead of guest system!: %llx\n",
1080 (unsigned long long)siginfo->ssi_addr);
1081 return;
1083 status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1084 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1085 | 0xc0;
1086 kvm_inject_x86_mce(first_cpu, 9, status,
1087 MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
1088 (MCM_ADDR_PHYS << 6) | 0xc, 1);
1089 for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
1090 kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
1091 MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
1092 } else
1093 #endif
1095 if (siginfo->ssi_code == BUS_MCEERR_AO)
1096 return;
1097 else if (siginfo->ssi_code == BUS_MCEERR_AR)
1098 hardware_memory_error();
1099 else
1100 sigbus_reraise();
1104 static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
1106 struct qemu_work_item wi;
1108 if (env == current_env) {
1109 func(data);
1110 return;
1113 wi.func = func;
1114 wi.data = data;
1115 if (!env->kvm_cpu_state.queued_work_first)
1116 env->kvm_cpu_state.queued_work_first = &wi;
1117 else
1118 env->kvm_cpu_state.queued_work_last->next = &wi;
1119 env->kvm_cpu_state.queued_work_last = &wi;
1120 wi.next = NULL;
1121 wi.done = false;
1123 pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
1124 while (!wi.done)
1125 qemu_cond_wait(&qemu_work_cond);
1128 static void do_kvm_cpu_synchronize_state(void *_env)
1130 CPUState *env = _env;
1132 if (!env->kvm_vcpu_dirty) {
1133 kvm_arch_save_regs(env);
1134 env->kvm_vcpu_dirty = 1;
1138 void kvm_cpu_synchronize_state(CPUState *env)
1140 if (!env->kvm_vcpu_dirty)
1141 on_vcpu(env, do_kvm_cpu_synchronize_state, env);
1144 void kvm_cpu_synchronize_post_reset(CPUState *env)
1146 kvm_arch_load_regs(env, KVM_PUT_RESET_STATE);
1147 env->kvm_vcpu_dirty = 0;
1150 void kvm_cpu_synchronize_post_init(CPUState *env)
1152 kvm_arch_load_regs(env, KVM_PUT_FULL_STATE);
1153 env->kvm_vcpu_dirty = 0;
1156 static void inject_interrupt(void *data)
1158 cpu_interrupt(current_env, (long) data);
1161 void kvm_inject_interrupt(CPUState *env, int mask)
1163 on_vcpu(env, inject_interrupt, (void *) (long) mask);
1166 void kvm_update_interrupt_request(CPUState *env)
1168 int signal = 0;
1170 if (env) {
1171 if (!current_env || !current_env->created)
1172 signal = 1;
1174 * Testing for created here is really redundant
1176 if (current_env && current_env->created &&
1177 env != current_env && !env->kvm_cpu_state.signalled)
1178 signal = 1;
1180 if (signal) {
1181 env->kvm_cpu_state.signalled = 1;
1182 if (env->kvm_cpu_state.thread)
1183 pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
1188 int kvm_cpu_exec(CPUState *env)
1190 int r;
1192 r = kvm_run(env);
1193 if (r < 0) {
1194 printf("kvm_run returned %d\n", r);
1195 vm_stop(0);
1198 return 0;
1201 int kvm_cpu_is_stopped(CPUState *env)
1203 return !vm_running || env->stopped;
1206 static void flush_queued_work(CPUState *env)
1208 struct qemu_work_item *wi;
1210 if (!env->kvm_cpu_state.queued_work_first)
1211 return;
1213 while ((wi = env->kvm_cpu_state.queued_work_first)) {
1214 env->kvm_cpu_state.queued_work_first = wi->next;
1215 wi->func(wi->data);
1216 wi->done = true;
1218 env->kvm_cpu_state.queued_work_last = NULL;
1219 pthread_cond_broadcast(&qemu_work_cond);
1222 static int kvm_mce_in_exception(CPUState *env)
1224 struct kvm_msr_entry msr_mcg_status = {
1225 .index = MSR_MCG_STATUS,
1227 int r;
1229 r = kvm_get_msrs(env, &msr_mcg_status, 1);
1230 if (r == -1 || r == 0)
1231 return -1;
1232 return !!(msr_mcg_status.data & MCG_STATUS_MCIP);
1235 static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
1237 #if defined(KVM_CAP_MCE) && defined(TARGET_I386)
1238 struct kvm_x86_mce mce = {
1239 .bank = 9,
1241 unsigned long paddr;
1242 int r;
1244 if (env->mcg_cap && siginfo->si_addr
1245 && (siginfo->si_code == BUS_MCEERR_AR
1246 || siginfo->si_code == BUS_MCEERR_AO)) {
1247 if (siginfo->si_code == BUS_MCEERR_AR) {
1248 /* Fake an Intel architectural Data Load SRAR UCR */
1249 mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1250 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1251 | MCI_STATUS_AR | 0x134;
1252 mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
1253 mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
1254 } else {
1256 * If there is an MCE excpetion being processed, ignore
1257 * this SRAO MCE
1259 r = kvm_mce_in_exception(env);
1260 if (r == -1)
1261 fprintf(stderr, "Failed to get MCE status\n");
1262 else if (r)
1263 return;
1264 /* Fake an Intel architectural Memory scrubbing UCR */
1265 mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1266 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1267 | 0xc0;
1268 mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
1269 mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
1271 if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) {
1272 fprintf(stderr, "Hardware memory error for memory used by "
1273 "QEMU itself instaed of guest system!\n");
1274 /* Hope we are lucky for AO MCE */
1275 if (siginfo->si_code == BUS_MCEERR_AO)
1276 return;
1277 else
1278 hardware_memory_error();
1280 mce.addr = paddr;
1281 r = kvm_set_mce(env, &mce);
1282 if (r < 0) {
1283 fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
1284 abort();
1286 } else
1287 #endif
1289 if (siginfo->si_code == BUS_MCEERR_AO)
1290 return;
1291 else if (siginfo->si_code == BUS_MCEERR_AR)
1292 hardware_memory_error();
1293 else
1294 sigbus_reraise();
1298 static void kvm_main_loop_wait(CPUState *env, int timeout)
1300 struct timespec ts;
1301 int r, e;
1302 siginfo_t siginfo;
1303 sigset_t waitset;
1304 sigset_t chkset;
1306 ts.tv_sec = timeout / 1000;
1307 ts.tv_nsec = (timeout % 1000) * 1000000;
1308 sigemptyset(&waitset);
1309 sigaddset(&waitset, SIG_IPI);
1310 sigaddset(&waitset, SIGBUS);
1312 do {
1313 pthread_mutex_unlock(&qemu_mutex);
1315 r = sigtimedwait(&waitset, &siginfo, &ts);
1316 e = errno;
1318 pthread_mutex_lock(&qemu_mutex);
1320 if (r == -1 && !(e == EAGAIN || e == EINTR)) {
1321 printf("sigtimedwait: %s\n", strerror(e));
1322 exit(1);
1325 switch (r) {
1326 case SIGBUS:
1327 kvm_on_sigbus(env, &siginfo);
1328 break;
1329 default:
1330 break;
1333 r = sigpending(&chkset);
1334 if (r == -1) {
1335 printf("sigpending: %s\n", strerror(e));
1336 exit(1);
1338 } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
1340 cpu_single_env = env;
1341 flush_queued_work(env);
1343 if (env->stop) {
1344 env->stop = 0;
1345 env->stopped = 1;
1346 pthread_cond_signal(&qemu_pause_cond);
1349 env->kvm_cpu_state.signalled = 0;
1352 static int all_threads_paused(void)
1354 CPUState *penv = first_cpu;
1356 while (penv) {
1357 if (penv->stop)
1358 return 0;
1359 penv = (CPUState *) penv->next_cpu;
1362 return 1;
1365 static void pause_all_threads(void)
1367 CPUState *penv = first_cpu;
1369 while (penv) {
1370 if (penv != cpu_single_env) {
1371 penv->stop = 1;
1372 pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI);
1373 } else {
1374 penv->stop = 0;
1375 penv->stopped = 1;
1376 cpu_exit(penv);
1378 penv = (CPUState *) penv->next_cpu;
1381 while (!all_threads_paused())
1382 qemu_cond_wait(&qemu_pause_cond);
1385 static void resume_all_threads(void)
1387 CPUState *penv = first_cpu;
1389 assert(!cpu_single_env);
1391 while (penv) {
1392 penv->stop = 0;
1393 penv->stopped = 0;
1394 pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI);
1395 penv = (CPUState *) penv->next_cpu;
/* VM state change callback: resume all vcpu threads when @running, pause them otherwise. */
static void kvm_vm_state_change_handler(void *context, int running, int reason)
{
    if (!running) {
        pause_all_threads();
        return;
    }
    resume_all_threads();
}
1407 static void setup_kernel_sigmask(CPUState *env)
1409 sigset_t set;
1411 sigemptyset(&set);
1412 sigaddset(&set, SIGUSR2);
1413 sigaddset(&set, SIGIO);
1414 sigaddset(&set, SIGALRM);
1415 sigprocmask(SIG_BLOCK, &set, NULL);
1417 sigprocmask(SIG_BLOCK, NULL, &set);
1418 sigdelset(&set, SIG_IPI);
1419 sigdelset(&set, SIGBUS);
1421 kvm_set_signal_mask(env, &set);
/*
 * Perform a system reset with all VCPU threads paused: pause, call
 * qemu_system_reset(), then resume.
 */
static void qemu_kvm_system_reset(void)
{
    pause_all_threads();
    qemu_system_reset();
    resume_all_threads();
}
1433 static void process_irqchip_events(CPUState *env)
1435 kvm_arch_process_irqchip_events(env);
1436 if (kvm_arch_has_work(env))
1437 env->halted = 0;
1440 static int kvm_main_loop_cpu(CPUState *env)
1442 while (1) {
1443 int run_cpu = !kvm_cpu_is_stopped(env);
1444 if (run_cpu && !kvm_irqchip_in_kernel()) {
1445 process_irqchip_events(env);
1446 run_cpu = !env->halted;
1448 if (run_cpu) {
1449 kvm_cpu_exec(env);
1450 kvm_main_loop_wait(env, 0);
1451 } else {
1452 kvm_main_loop_wait(env, 1000);
1455 pthread_mutex_unlock(&qemu_mutex);
1456 return 0;
1459 static void *ap_main_loop(void *_env)
1461 CPUState *env = _env;
1462 sigset_t signals;
1463 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1464 struct ioperm_data *data = NULL;
1465 #endif
1467 current_env = env;
1468 env->thread_id = kvm_get_thread_id();
1469 sigfillset(&signals);
1470 sigprocmask(SIG_BLOCK, &signals, NULL);
1472 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1473 /* do ioperm for io ports of assigned devices */
1474 QLIST_FOREACH(data, &ioperm_head, entries)
1475 on_vcpu(env, kvm_arch_do_ioperm, data);
1476 #endif
1478 pthread_mutex_lock(&qemu_mutex);
1479 cpu_single_env = env;
1481 kvm_create_vcpu(env, env->cpu_index);
1482 setup_kernel_sigmask(env);
1484 /* signal VCPU creation */
1485 current_env->created = 1;
1486 pthread_cond_signal(&qemu_vcpu_cond);
1488 /* and wait for machine initialization */
1489 while (!qemu_system_ready)
1490 qemu_cond_wait(&qemu_system_cond);
1492 /* re-initialize cpu_single_env after re-acquiring qemu_mutex */
1493 cpu_single_env = env;
1495 kvm_main_loop_cpu(env);
1496 return NULL;
1499 int kvm_init_vcpu(CPUState *env)
1501 pthread_create(&env->kvm_cpu_state.thread, NULL, ap_main_loop, env);
1503 while (env->created == 0)
1504 qemu_cond_wait(&qemu_vcpu_cond);
1506 return 0;
1509 int kvm_vcpu_inited(CPUState *env)
1511 return env->created;
1514 #ifdef TARGET_I386
1515 void kvm_hpet_disable_kpit(void)
1517 struct kvm_pit_state2 ps2;
1519 kvm_get_pit2(kvm_context, &ps2);
1520 ps2.flags |= KVM_PIT_FLAGS_HPET_LEGACY;
1521 kvm_set_pit2(kvm_context, &ps2);
1524 void kvm_hpet_enable_kpit(void)
1526 struct kvm_pit_state2 ps2;
1528 kvm_get_pit2(kvm_context, &ps2);
1529 ps2.flags &= ~KVM_PIT_FLAGS_HPET_LEGACY;
1530 kvm_set_pit2(kvm_context, &ps2);
1532 #endif
1534 int kvm_init_ap(void)
1536 struct sigaction action;
1538 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
1540 signal(SIG_IPI, sig_ipi_handler);
1542 memset(&action, 0, sizeof(action));
1543 action.sa_flags = SA_SIGINFO;
1544 action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
1545 sigaction(SIGBUS, &action, NULL);
1546 prctl(PR_MCE_KILL, 1, 1, 0, 0);
1547 return 0;
1550 void qemu_kvm_notify_work(void)
1552 /* Write 8 bytes to be compatible with eventfd. */
1553 static uint64_t val = 1;
1554 ssize_t ret;
1556 if (io_thread_fd == -1)
1557 return;
1559 do {
1560 ret = write(io_thread_fd, &val, sizeof(val));
1561 } while (ret < 0 && errno == EINTR);
1563 /* EAGAIN is fine in case we have a pipe. */
1564 if (ret < 0 && errno != EAGAIN) {
1565 fprintf(stderr, "qemu_kvm_notify_work: write() filed: %s\n",
1566 strerror(errno));
1567 exit (1);
1571 /* If we have signalfd, we mask out the signals we want to handle and then
1572 * use signalfd to listen for them. We rely on whatever the current signal
1573 * handler is to dispatch the signals when we receive them.
1576 static void sigfd_handler(void *opaque)
1578 int fd = (unsigned long) opaque;
1579 struct qemu_signalfd_siginfo info;
1580 struct sigaction action;
1581 ssize_t len;
1583 while (1) {
1584 do {
1585 len = read(fd, &info, sizeof(info));
1586 } while (len == -1 && errno == EINTR);
1588 if (len == -1 && errno == EAGAIN)
1589 break;
1591 if (len != sizeof(info)) {
1592 printf("read from sigfd returned %zd: %m\n", len);
1593 return;
1596 sigaction(info.ssi_signo, NULL, &action);
1597 if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction)
1598 action.sa_sigaction(info.ssi_signo,
1599 (siginfo_t *)&info, NULL);
1600 else if (action.sa_handler)
1601 action.sa_handler(info.ssi_signo);
/*
 * Used to break the IO thread out of select.
 *
 * 'opaque' carries the read end of the notification descriptor.  The
 * descriptor is drained: reads are retried on EINTR and repeated for as long
 * as a read fills the whole scratch buffer.
 */
static void io_thread_wakeup(void *opaque)
{
    int fd = (unsigned long) opaque;
    char scratch[512];
    ssize_t nread;

    /* Drain the notify pipe.  For eventfd, only 8 bytes will be read. */
    for (;;) {
        nread = read(fd, scratch, sizeof(scratch));
        if (nread == -1 && errno == EINTR) {
            continue;
        }
        if (nread != sizeof(scratch)) {
            break;
        }
    }
}
1619 int kvm_main_loop(void)
1621 int fds[2];
1622 sigset_t mask;
1623 int sigfd;
1625 io_thread = pthread_self();
1626 qemu_system_ready = 1;
1628 if (qemu_eventfd(fds) == -1) {
1629 fprintf(stderr, "failed to create eventfd\n");
1630 return -errno;
1633 fcntl(fds[0], F_SETFL, O_NONBLOCK);
1634 fcntl(fds[1], F_SETFL, O_NONBLOCK);
1636 qemu_set_fd_handler2(fds[0], NULL, io_thread_wakeup, NULL,
1637 (void *)(unsigned long) fds[0]);
1639 io_thread_fd = fds[1];
1641 sigemptyset(&mask);
1642 sigaddset(&mask, SIGIO);
1643 sigaddset(&mask, SIGALRM);
1644 sigaddset(&mask, SIGBUS);
1645 sigprocmask(SIG_BLOCK, &mask, NULL);
1647 sigfd = qemu_signalfd(&mask);
1648 if (sigfd == -1) {
1649 fprintf(stderr, "failed to create signalfd\n");
1650 return -errno;
1653 fcntl(sigfd, F_SETFL, O_NONBLOCK);
1655 qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
1656 (void *)(unsigned long) sigfd);
1658 pthread_cond_broadcast(&qemu_system_cond);
1660 io_thread_sigfd = sigfd;
1661 cpu_single_env = NULL;
1663 while (1) {
1664 main_loop_wait(0);
1665 if (qemu_shutdown_requested()) {
1666 monitor_protocol_event(QEVENT_SHUTDOWN, NULL);
1667 if (qemu_no_shutdown()) {
1668 vm_stop(0);
1669 } else
1670 break;
1671 } else if (qemu_powerdown_requested()) {
1672 monitor_protocol_event(QEVENT_POWERDOWN, NULL);
1673 qemu_irq_raise(qemu_system_powerdown);
1674 } else if (qemu_reset_requested()) {
1675 qemu_kvm_system_reset();
1676 } else if (kvm_debug_cpu_requested) {
1677 gdb_set_stop_cpu(kvm_debug_cpu_requested);
1678 vm_stop(EXCP_DEBUG);
1679 kvm_debug_cpu_requested = NULL;
1681 if (qemu_exit_requested()) {
1682 exit(0);
1686 pause_all_threads();
1687 pthread_mutex_unlock(&qemu_mutex);
1689 return 0;
1692 #if !defined(TARGET_I386)
/*
 * Fallback used when TARGET_I386 is not defined: performs no work and
 * reports success (0).
 */
int kvm_arch_init_irq_routing(void)
{
    const int status = 0;

    return status;
}
1697 #endif
1699 extern int no_hpet;
1701 static int kvm_create_context(void)
1703 static const char upgrade_note[] =
1704 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
1705 "(see http://sourceforge.net/projects/kvm).\n";
1707 int r;
1709 if (!kvm_irqchip) {
1710 kvm_disable_irqchip_creation(kvm_context);
1712 if (!kvm_pit) {
1713 kvm_disable_pit_creation(kvm_context);
1715 if (kvm_create(kvm_context, 0, NULL) < 0) {
1716 kvm_finalize(kvm_state);
1717 return -1;
1719 r = kvm_arch_qemu_create_context();
1720 if (r < 0) {
1721 kvm_finalize(kvm_state);
1722 return -1;
1724 if (kvm_pit && !kvm_pit_reinject) {
1725 if (kvm_reinject_control(kvm_context, 0)) {
1726 fprintf(stderr, "failure to disable in-kernel PIT reinjection\n");
1727 return -1;
1731 /* There was a nasty bug in < kvm-80 that prevents memory slots from being
1732 * destroyed properly. Since we rely on this capability, refuse to work
1733 * with any kernel without this capability. */
1734 if (!kvm_check_extension(kvm_state, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
1735 fprintf(stderr,
1736 "KVM kernel module broken (DESTROY_MEMORY_REGION).\n%s",
1737 upgrade_note);
1738 return -EINVAL;
1741 r = kvm_arch_init_irq_routing();
1742 if (r < 0) {
1743 return r;
1746 kvm_state->vcpu_events = 0;
1747 #ifdef KVM_CAP_VCPU_EVENTS
1748 kvm_state->vcpu_events = kvm_check_extension(kvm_state, KVM_CAP_VCPU_EVENTS);
1749 #endif
1751 kvm_state->debugregs = 0;
1752 #ifdef KVM_CAP_DEBUGREGS
1753 kvm_state->debugregs = kvm_check_extension(kvm_state, KVM_CAP_DEBUGREGS);
1754 #endif
1756 kvm_init_ap();
1757 if (kvm_irqchip) {
1758 if (!qemu_kvm_has_gsi_routing()) {
1759 irq0override = 0;
1760 #ifdef TARGET_I386
1761 /* if kernel can't do irq routing, interrupt source
1762 * override 0->2 can not be set up as required by hpet,
1763 * so disable hpet.
1765 no_hpet = 1;
1766 } else if (!qemu_kvm_has_pit_state2()) {
1767 no_hpet = 1;
1769 #else
1771 #endif
1774 return 0;
1777 #ifdef KVM_CAP_IRQCHIP
1779 int kvm_set_irq(int irq, int level, int *status)
1781 return kvm_set_irq_level(kvm_context, irq, level, status);
1784 #endif
1786 void kvm_mutex_unlock(void)
1788 assert(!cpu_single_env);
1789 pthread_mutex_unlock(&qemu_mutex);
1792 void kvm_mutex_lock(void)
1794 pthread_mutex_lock(&qemu_mutex);
1795 cpu_single_env = NULL;
/*
 * Release the iothread lock; a no-op unless KVM is enabled.
 */
void qemu_mutex_unlock_iothread(void)
{
    if (!kvm_enabled()) {
        return;
    }

    kvm_mutex_unlock();
}
/*
 * Acquire the iothread lock; a no-op unless KVM is enabled.
 */
void qemu_mutex_lock_iothread(void)
{
    if (!kvm_enabled()) {
        return;
    }

    kvm_mutex_lock();
}
1810 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1811 void kvm_add_ioperm_data(struct ioperm_data *data)
1813 QLIST_INSERT_HEAD(&ioperm_head, data, entries);
1816 void kvm_remove_ioperm_data(unsigned long start_port, unsigned long num)
1818 struct ioperm_data *data;
1820 data = QLIST_FIRST(&ioperm_head);
1821 while (data) {
1822 struct ioperm_data *next = QLIST_NEXT(data, entries);
1824 if (data->start_port == start_port && data->num == num) {
1825 QLIST_REMOVE(data, entries);
1826 qemu_free(data);
1829 data = next;
1833 void kvm_ioperm(CPUState *env, void *data)
1835 if (kvm_enabled() && qemu_system_ready)
1836 on_vcpu(env, kvm_arch_do_ioperm, data);
1839 #endif
1841 int kvm_set_boot_cpu_id(uint32_t id)
1843 return kvm_set_boot_vcpu_id(kvm_context, id);
1846 #ifdef TARGET_I386
1847 #ifdef KVM_CAP_MCE
1848 struct kvm_x86_mce_data {
1849 CPUState *env;
1850 struct kvm_x86_mce *mce;
1851 int abort_on_error;
1854 static void kvm_do_inject_x86_mce(void *_data)
1856 struct kvm_x86_mce_data *data = _data;
1857 int r;
1859 /* If there is an MCE excpetion being processed, ignore this SRAO MCE */
1860 r = kvm_mce_in_exception(data->env);
1861 if (r == -1)
1862 fprintf(stderr, "Failed to get MCE status\n");
1863 else if (r && !(data->mce->status & MCI_STATUS_AR))
1864 return;
1865 r = kvm_set_mce(data->env, data->mce);
1866 if (r < 0) {
1867 perror("kvm_set_mce FAILED");
1868 if (data->abort_on_error)
1869 abort();
1872 #endif
1874 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
1875 uint64_t mcg_status, uint64_t addr, uint64_t misc,
1876 int abort_on_error)
1878 #ifdef KVM_CAP_MCE
1879 struct kvm_x86_mce mce = {
1880 .bank = bank,
1881 .status = status,
1882 .mcg_status = mcg_status,
1883 .addr = addr,
1884 .misc = misc,
1886 struct kvm_x86_mce_data data = {
1887 .env = cenv,
1888 .mce = &mce,
1889 .abort_on_error = abort_on_error,
1892 if (!cenv->mcg_cap) {
1893 fprintf(stderr, "MCE support is not enabled!\n");
1894 return;
1896 on_vcpu(cenv, kvm_do_inject_x86_mce, &data);
1897 #else
1898 if (abort_on_error)
1899 abort();
1900 #endif
1902 #endif