/*
 * qemu/kvm integration
 *
 * Copyright (C) 2006-2008 Qumranet Technologies
 *
 * Licensed under the terms of the GNU GPL version 2 or higher.
 */
#include "config.h"
#include "config-host.h"

#include <assert.h>
#include <string.h>
#include "hw/hw.h"
#include "sysemu.h"
#include "qemu-common.h"
#include "console.h"
#include "block.h"
#include "compatfd.h"
#include "gdbstub.h"
#include "monitor.h"
#include "cpus.h"
#include "qemu-thread.h"

#include "qemu-kvm.h"

#include <pthread.h>
#include <sys/utsname.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include "compatfd.h"
#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef BUS_MCEERR_AR
#define BUS_MCEERR_AR 4
#endif
#ifndef BUS_MCEERR_AO
#define BUS_MCEERR_AO 5
#endif

#define EXPECTED_KVM_API_VERSION 12

#if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
#error libkvm: userspace and kernel version mismatch
#endif
int kvm_irqchip = 1;
int kvm_pit = 1;
int kvm_pit_reinject = 1;
int kvm_nested = 0;

KVMState *kvm_state;
kvm_context_t kvm_context;

pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t qemu_vcpu_cond = PTHREAD_COND_INITIALIZER;
pthread_cond_t qemu_system_cond = PTHREAD_COND_INITIALIZER;
pthread_cond_t qemu_pause_cond = PTHREAD_COND_INITIALIZER;
pthread_cond_t qemu_work_cond = PTHREAD_COND_INITIALIZER;
__thread CPUState *current_env;

static int qemu_system_ready;

#define SIG_IPI (SIGRTMIN+4)

CPUState *kvm_debug_cpu_requested;

#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
/* The list of ioperm_data */
static QLIST_HEAD(, ioperm_data) ioperm_head;
#endif

#define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
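/* Exit-reason helpers, called from the vcpu dispatch loop in kvm_run() for
 * exits that only need a diagnostic message before failing. */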
static int handle_unhandled(uint64_t reason)
{
    fprintf(stderr, "kvm: unhandled exit %" PRIx64 "\n", reason);
    return -EINVAL;
}

#define VMX_INVALID_GUEST_STATE 0x80000021

static int handle_failed_vmentry(uint64_t reason)
{
    fprintf(stderr, "kvm: vm entry failed with error 0x%" PRIx64 "\n\n", reason);

    /* Perhaps we will need to check if this machine is Intel, since exit
       reason 0x21 has a different interpretation on SVM */
    if (reason == VMX_INVALID_GUEST_STATE) {
        fprintf(stderr, "If you're running a guest on an Intel machine without\n");
        fprintf(stderr, "unrestricted mode support, the failure is most likely\n");
        fprintf(stderr, "due to the guest entering an invalid state for Intel VT.\n");
        fprintf(stderr, "For example, the guest may be running in big real mode,\n");
        fprintf(stderr, "which is not supported on older Intel processors.\n\n");
    }

    return -EINVAL;
}
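/* The used_gsi_bitmap tracks which global system interrupts already carry a
 * routing entry: one bit per GSI, scanned 32 bits at a time with ffs() by
 * kvm_get_irq_route_gsi() below. */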
static inline void set_gsi(kvm_context_t kvm, unsigned int gsi)
{
    uint32_t *bitmap = kvm->used_gsi_bitmap;

    if (gsi < kvm->max_gsi)
        bitmap[gsi / 32] |= 1U << (gsi % 32);
    else
        DPRINTF("Invalid GSI %u\n", gsi);
}

static inline void clear_gsi(kvm_context_t kvm, unsigned int gsi)
{
    uint32_t *bitmap = kvm->used_gsi_bitmap;

    if (gsi < kvm->max_gsi)
        bitmap[gsi / 32] &= ~(1U << (gsi % 32));
    else
        DPRINTF("Invalid GSI %u\n", gsi);
}
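/* kvm_init(): open /dev/kvm, check the API version, allocate the global
 * KVMState and GSI bitmap, then hand off to kvm_create_context(), which is
 * entered with qemu_mutex held. */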
static int kvm_create_context(void);

int kvm_init(void)
{
    int fd;
    int r, gsi_count, i;

    fd = open("/dev/kvm", O_RDWR);
    if (fd == -1) {
        perror("open /dev/kvm");
        return -1;
    }
    r = ioctl(fd, KVM_GET_API_VERSION, 0);
    if (r == -1) {
        fprintf(stderr,
                "kvm kernel version too old: "
                "KVM_GET_API_VERSION ioctl not supported\n");
        goto out_close;
    }
    if (r < EXPECTED_KVM_API_VERSION) {
        fprintf(stderr, "kvm kernel version too old: "
                "We expect API version %d or newer, but got "
                "version %d\n", EXPECTED_KVM_API_VERSION, r);
        goto out_close;
    }
    if (r > EXPECTED_KVM_API_VERSION) {
        fprintf(stderr, "kvm userspace version too old\n");
        goto out_close;
    }
    kvm_state = qemu_mallocz(sizeof(*kvm_state));
    kvm_context = &kvm_state->kvm_context;

    kvm_state->fd = fd;
    kvm_state->vmfd = -1;

#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_INIT(&kvm_state->kvm_sw_breakpoints);
#endif

    for (i = 0; i < ARRAY_SIZE(kvm_state->slots); i++) {
        kvm_state->slots[i].slot = i;
    }

#ifdef KVM_CAP_USER_MEMORY
    r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY);
    if (r <= 0) {
        fprintf(stderr,
                "Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported\n");
        goto out_close;
    }
#else
#error Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported
#endif

    gsi_count = kvm_get_gsi_count(kvm_context);
    if (gsi_count > 0) {
        int gsi_bits, i;

        /* Round up so we can search ints using ffs */
        gsi_bits = ALIGN(gsi_count, 32);
        kvm_context->used_gsi_bitmap = qemu_mallocz(gsi_bits / 8);
        kvm_context->max_gsi = gsi_bits;

        /* Mark any over-allocated bits as already in use */
        for (i = gsi_count; i < gsi_bits; i++) {
            set_gsi(kvm_context, i);
        }
    }

    kvm_cpu_register_phys_memory_client();

    pthread_mutex_lock(&qemu_mutex);
    return kvm_create_context();

out_close:
    close(fd);
    return -1;
}
static void kvm_finalize(KVMState *s)
{
    /* FIXME
      if (kvm->vcpu_fd[0] != -1)
          close(kvm->vcpu_fd[0]);
      if (kvm->vm_fd != -1)
          close(kvm->vm_fd);
    */
    close(s->fd);
    free(s);
}
static int kvm_set_boot_vcpu_id(kvm_context_t kvm, uint32_t id)
{
#ifdef KVM_CAP_SET_BOOT_CPU_ID
    int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_SET_BOOT_CPU_ID);
    if (r > 0) {
        return kvm_vm_ioctl(kvm_state, KVM_SET_BOOT_CPU_ID, id);
    }
    return -ENOSYS;
#else
    return -ENOSYS;
#endif
}
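/* Create the in-kernel irqchip when kvm_irqchip is enabled and the host
 * advertises KVM_CAP_IRQCHIP; prefer KVM_IRQ_LINE_STATUS when the
 * inject-status capability exists so callers can tell whether an injection
 * was delivered or coalesced. */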
void kvm_create_irqchip(kvm_context_t kvm)
{
    int r;

#ifdef KVM_CAP_IRQCHIP
    if (kvm_irqchip) {
        r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP);
        if (r > 0) {            /* kernel irqchip supported */
            r = kvm_vm_ioctl(kvm_state, KVM_CREATE_IRQCHIP);
            if (r >= 0) {
                kvm->irqchip_inject_ioctl = KVM_IRQ_LINE;
#if defined(KVM_CAP_IRQ_INJECT_STATUS) && defined(KVM_IRQ_LINE_STATUS)
                r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION,
                              KVM_CAP_IRQ_INJECT_STATUS);
                if (r > 0) {
                    kvm->irqchip_inject_ioctl = KVM_IRQ_LINE_STATUS;
                }
#endif
                kvm_state->irqchip_in_kernel = 1;
            } else
                fprintf(stderr, "Create kernel PIC irqchip failed\n");
        }
    }
#endif
}
#ifdef KVM_CAP_IRQCHIP

int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int *status)
{
    struct kvm_irq_level event;
    int r;

    if (!kvm_state->irqchip_in_kernel) {
        return 0;
    }
    event.level = level;
    event.irq = irq;
    r = kvm_vm_ioctl(kvm_state, kvm->irqchip_inject_ioctl, &event);
    if (r < 0) {
        perror("kvm_set_irq_level");
    }

    if (status) {
#ifdef KVM_CAP_IRQ_INJECT_STATUS
        *status =
            (kvm->irqchip_inject_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
#else
        *status = 1;
#endif
    }

    return 1;
}

int kvm_get_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
{
    int r;

    if (!kvm_state->irqchip_in_kernel) {
        return 0;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip);
    if (r < 0) {
        perror("kvm_get_irqchip");
    }
    return r;
}

int kvm_set_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
{
    int r;

    if (!kvm_state->irqchip_in_kernel) {
        return 0;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip);
    if (r < 0) {
        perror("kvm_set_irqchip");
    }
    return r;
}

#endif
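/* MMIO exits are completed on the qemu side by routing the access through
 * cpu_physical_memory_rw(); port I/O exits go through kvm_handle_io() in the
 * dispatch loop below. */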
static int handle_mmio(CPUState *env)
{
    unsigned long addr = env->kvm_run->mmio.phys_addr;
    struct kvm_run *kvm_run = env->kvm_run;
    void *data = kvm_run->mmio.data;

    /* hack: Red Hat 7.1 generates these weird accesses. */
    if ((addr > 0xa0000 - 4 && addr <= 0xa0000) && kvm_run->mmio.len == 3) {
        return 0;
    }

    cpu_physical_memory_rw(addr, data, kvm_run->mmio.len, kvm_run->mmio.is_write);
    return 0;
}

int handle_io_window(kvm_context_t kvm)
{
    return 1;
}

int handle_shutdown(kvm_context_t kvm, CPUState *env)
{
    /* stop the current vcpu from going back to guest mode */
    env->stopped = 1;

    qemu_system_reset_request();
    return 1;
}

static inline void push_nmi(kvm_context_t kvm)
{
#ifdef KVM_CAP_USER_NMI
    kvm_arch_push_nmi();
#endif /* KVM_CAP_USER_NMI */
}

void post_kvm_run(kvm_context_t kvm, CPUState *env)
{
    pthread_mutex_lock(&qemu_mutex);
    kvm_arch_post_run(env, env->kvm_run);
    cpu_single_env = env;
}

int pre_kvm_run(kvm_context_t kvm, CPUState *env)
{
    kvm_arch_pre_run(env, env->kvm_run);

    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}

int kvm_is_ready_for_interrupt_injection(CPUState *env)
{
    return env->kvm_run->ready_for_interrupt_injection;
}
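/* Per-vcpu dispatch: flush dirty register state into the kernel, enter the
 * guest with KVM_RUN (with qemu_mutex dropped around the ioctl), then decode
 * run->exit_reason; exits that are fully handled loop straight back into the
 * guest via the "again" label. */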
int kvm_run(CPUState *env)
{
    int r;
    kvm_context_t kvm = &env->kvm_state->kvm_context;
    struct kvm_run *run = env->kvm_run;
    int fd = env->kvm_fd;

again:
    if (env->kvm_vcpu_dirty) {
        kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
        env->kvm_vcpu_dirty = 0;
    }
    push_nmi(kvm);
#if !defined(__s390__)
    if (!kvm_state->irqchip_in_kernel) {
        run->request_interrupt_window = kvm_arch_try_push_interrupts(env);
    }
#endif

    r = pre_kvm_run(kvm, env);
    if (r) {
        return r;
    }
    if (env->exit_request) {
        env->exit_request = 0;
        pthread_kill(env->thread->thread, SIG_IPI);
    }
    r = ioctl(fd, KVM_RUN, 0);

    if (r == -1 && errno != EINTR && errno != EAGAIN) {
        r = -errno;
        post_kvm_run(kvm, env);
        fprintf(stderr, "kvm_run: %s\n", strerror(-r));
        return r;
    }

    post_kvm_run(kvm, env);

    kvm_flush_coalesced_mmio_buffer();

#if !defined(__s390__)
    if (r == -1) {
        r = handle_io_window(kvm);
        goto more;
    }
#endif
    if (1) {
        switch (run->exit_reason) {
        case KVM_EXIT_UNKNOWN:
            r = handle_unhandled(run->hw.hardware_exit_reason);
            break;
        case KVM_EXIT_FAIL_ENTRY:
            r = handle_failed_vmentry(run->fail_entry.hardware_entry_failure_reason);
            break;
        case KVM_EXIT_EXCEPTION:
            fprintf(stderr, "exception %d (%x)\n", run->ex.exception,
                    run->ex.error_code);
            cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
            abort();
            break;
        case KVM_EXIT_IO:
            kvm_handle_io(run->io.port,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            r = 0;
            break;
        case KVM_EXIT_MMIO:
            r = handle_mmio(env);
            break;
        case KVM_EXIT_HLT:
            r = kvm_arch_halt(env);
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            break;
        case KVM_EXIT_SHUTDOWN:
            r = handle_shutdown(kvm, env);
            break;
#if defined(__s390__)
        case KVM_EXIT_S390_SIEIC:
            r = kvm_s390_handle_intercept(kvm, env, run);
            break;
        case KVM_EXIT_S390_RESET:
            r = kvm_s390_handle_reset(kvm, env, run);
            break;
#endif
        case KVM_EXIT_INTERNAL_ERROR:
            r = kvm_handle_internal_error(env, run);
            break;
        default:
            r = kvm_arch_run(env);
            if (r < 0) {
                fprintf(stderr, "unhandled vm exit: 0x%x\n", run->exit_reason);
                cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
                abort();
            }
            if (r > 0) {
                return r;
            }
            break;
        }
    }
more:
    if (!r) {
        goto again;
    }
    return r;
}
int kvm_inject_irq(CPUState *env, unsigned irq)
{
    struct kvm_interrupt intr;

    intr.irq = irq;
    return kvm_vcpu_ioctl(env, KVM_INTERRUPT, &intr);
}

int kvm_inject_nmi(CPUState *env)
{
#ifdef KVM_CAP_USER_NMI
    return kvm_vcpu_ioctl(env, KVM_NMI);
#else
    return -ENOSYS;
#endif
}
#ifdef KVM_CAP_DEVICE_ASSIGNMENT
int kvm_assign_pci_device(kvm_context_t kvm,
                          struct kvm_assigned_pci_dev *assigned_dev)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_PCI_DEVICE, assigned_dev);
}

static int kvm_old_assign_irq(kvm_context_t kvm,
                              struct kvm_assigned_irq *assigned_irq)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_IRQ, assigned_irq);
}

#ifdef KVM_CAP_ASSIGN_DEV_IRQ
int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
{
    int ret;

    ret = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_ASSIGN_DEV_IRQ);
    if (ret > 0) {
        return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_DEV_IRQ, assigned_irq);
    }

    return kvm_old_assign_irq(kvm, assigned_irq);
}

int kvm_deassign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
{
    return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_DEV_IRQ, assigned_irq);
}
#else
int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
{
    return kvm_old_assign_irq(kvm, assigned_irq);
}
#endif
#endif

#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
int kvm_deassign_pci_device(kvm_context_t kvm,
                            struct kvm_assigned_pci_dev *assigned_dev)
{
    return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_PCI_DEVICE, assigned_dev);
}
#endif
int kvm_reinject_control(kvm_context_t kvm, int pit_reinject)
{
#ifdef KVM_CAP_REINJECT_CONTROL
    int r;
    struct kvm_reinject_control control;

    control.pit_reinject = pit_reinject;

    r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_REINJECT_CONTROL);
    if (r > 0) {
        return kvm_vm_ioctl(kvm_state, KVM_REINJECT_CONTROL, &control);
    }
#endif
    return -ENOSYS;
}

int kvm_has_gsi_routing(void)
{
    int r = 0;

#ifdef KVM_CAP_IRQ_ROUTING
    r = kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#endif
    return r;
}

int kvm_get_gsi_count(kvm_context_t kvm)
{
#ifdef KVM_CAP_IRQ_ROUTING
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#else
    return -EINVAL;
#endif
}

int kvm_clear_gsi_routes(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;

    kvm->irq_routes->nr = 0;
    return 0;
#else
    return -EINVAL;
#endif
}
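/* Routing-table management: the kvm_irq_routing array grows on demand
 * (doubling, with a minimum of 64 entries) and is only pushed to the kernel
 * when kvm_commit_irq_routes() issues KVM_SET_GSI_ROUTING. */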
int kvm_add_routing_entry(struct kvm_irq_routing_entry *entry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;
    struct kvm_irq_routing *z;
    struct kvm_irq_routing_entry *new;
    int n, size;

    if (kvm->irq_routes->nr == kvm->nr_allocated_irq_routes) {
        n = kvm->nr_allocated_irq_routes * 2;
        if (n < 64) {
            n = 64;
        }
        size = sizeof(struct kvm_irq_routing);
        size += n * sizeof(*new);
        z = realloc(kvm->irq_routes, size);
        if (!z) {
            return -ENOMEM;
        }
        kvm->nr_allocated_irq_routes = n;
        kvm->irq_routes = z;
    }
    n = kvm->irq_routes->nr++;
    new = &kvm->irq_routes->entries[n];
    memset(new, 0, sizeof(*new));
    new->gsi = entry->gsi;
    new->type = entry->type;
    new->flags = entry->flags;
    new->u = entry->u;

    set_gsi(kvm, entry->gsi);

    return 0;
#else
    return -ENOSYS;
#endif
}

int kvm_add_irq_route(int gsi, int irqchip, int pin)
{
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing_entry e;

    e.gsi = gsi;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    return kvm_add_routing_entry(&e);
#else
    return -ENOSYS;
#endif
}
int kvm_del_routing_entry(struct kvm_irq_routing_entry *entry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;
    struct kvm_irq_routing_entry *e, *p;
    int i, gsi, found = 0;

    gsi = entry->gsi;

    for (i = 0; i < kvm->irq_routes->nr; ++i) {
        e = &kvm->irq_routes->entries[i];
        if (e->type == entry->type && e->gsi == gsi) {
            switch (e->type) {
            case KVM_IRQ_ROUTING_IRQCHIP:
                if (e->u.irqchip.irqchip == entry->u.irqchip.irqchip &&
                    e->u.irqchip.pin == entry->u.irqchip.pin) {
                    p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
                    *e = *p;
                    found = 1;
                }
                break;
            case KVM_IRQ_ROUTING_MSI:
                if (e->u.msi.address_lo == entry->u.msi.address_lo &&
                    e->u.msi.address_hi == entry->u.msi.address_hi &&
                    e->u.msi.data == entry->u.msi.data) {
                    p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
                    *e = *p;
                    found = 1;
                }
                break;
            default:
                break;
            }
            if (found) {
                /* If there are no other users of this GSI
                 * mark it available in the bitmap */
                for (i = 0; i < kvm->irq_routes->nr; i++) {
                    e = &kvm->irq_routes->entries[i];
                    if (e->gsi == gsi)
                        break;
                }
                if (i == kvm->irq_routes->nr) {
                    clear_gsi(kvm, gsi);
                }

                return 0;
            }
        }
    }

    return -ESRCH;
#else
    return -ENOSYS;
#endif
}
int kvm_update_routing_entry(struct kvm_irq_routing_entry *entry,
                             struct kvm_irq_routing_entry *newentry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;
    struct kvm_irq_routing_entry *e;
    int i;

    if (entry->gsi != newentry->gsi || entry->type != newentry->type) {
        return -EINVAL;
    }

    for (i = 0; i < kvm->irq_routes->nr; ++i) {
        e = &kvm->irq_routes->entries[i];
        if (e->type != entry->type || e->gsi != entry->gsi) {
            continue;
        }
        switch (e->type) {
        case KVM_IRQ_ROUTING_IRQCHIP:
            if (e->u.irqchip.irqchip == entry->u.irqchip.irqchip &&
                e->u.irqchip.pin == entry->u.irqchip.pin) {
                memcpy(&e->u.irqchip, &newentry->u.irqchip,
                       sizeof e->u.irqchip);
                return 0;
            }
            break;
        case KVM_IRQ_ROUTING_MSI:
            if (e->u.msi.address_lo == entry->u.msi.address_lo &&
                e->u.msi.address_hi == entry->u.msi.address_hi &&
                e->u.msi.data == entry->u.msi.data) {
                memcpy(&e->u.msi, &newentry->u.msi, sizeof e->u.msi);
                return 0;
            }
            break;
        default:
            break;
        }
    }

    return -ESRCH;
#else
    return -ENOSYS;
#endif
}

int kvm_del_irq_route(int gsi, int irqchip, int pin)
{
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing_entry e;

    e.gsi = gsi;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    return kvm_del_routing_entry(&e);
#else
    return -ENOSYS;
#endif
}
int kvm_commit_irq_routes(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;

    kvm->irq_routes->flags = 0;
    return kvm_vm_ioctl(kvm_state, KVM_SET_GSI_ROUTING, kvm->irq_routes);
#else
    return -ENOSYS;
#endif
}

int kvm_get_irq_route_gsi(void)
{
    kvm_context_t kvm = kvm_context;
    int i, bit;
    uint32_t *buf = kvm->used_gsi_bitmap;

    /* Return the lowest unused GSI in the bitmap */
    for (i = 0; i < kvm->max_gsi / 32; i++) {
        bit = ffs(~buf[i]);
        if (!bit) {
            continue;
        }

        return bit - 1 + i * 32;
    }

    return -ENOSPC;
}
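/* MSI messages are mapped onto dynamically allocated GSIs: a free GSI is
 * taken from used_gsi_bitmap and a KVM_IRQ_ROUTING_MSI entry carrying the
 * message address and data is installed for it. */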
static void kvm_msi_routing_entry(struct kvm_irq_routing_entry *e,
                                  KVMMsiMessage *msg)
{
    e->gsi = msg->gsi;
    e->type = KVM_IRQ_ROUTING_MSI;
    e->flags = 0;
    e->u.msi.address_lo = msg->addr_lo;
    e->u.msi.address_hi = msg->addr_hi;
    e->u.msi.data = msg->data;
}

int kvm_msi_message_add(KVMMsiMessage *msg)
{
    struct kvm_irq_routing_entry e;
    int ret;

    ret = kvm_get_irq_route_gsi();
    if (ret < 0) {
        return ret;
    }
    msg->gsi = ret;

    kvm_msi_routing_entry(&e, msg);
    return kvm_add_routing_entry(&e);
}

int kvm_msi_message_del(KVMMsiMessage *msg)
{
    struct kvm_irq_routing_entry e;

    kvm_msi_routing_entry(&e, msg);
    return kvm_del_routing_entry(&e);
}

int kvm_msi_message_update(KVMMsiMessage *old, KVMMsiMessage *new)
{
    struct kvm_irq_routing_entry e1, e2;
    int ret;

    new->gsi = old->gsi;
    if (memcmp(old, new, sizeof(KVMMsiMessage)) == 0) {
        return 0;
    }

    kvm_msi_routing_entry(&e1, old);
    kvm_msi_routing_entry(&e2, new);

    ret = kvm_update_routing_entry(&e1, &e2);
    if (ret < 0) {
        return ret;
    }

    return 1;
}
#ifdef KVM_CAP_DEVICE_MSIX
int kvm_assign_set_msix_nr(kvm_context_t kvm,
                           struct kvm_assigned_msix_nr *msix_nr)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_NR, msix_nr);
}

int kvm_assign_set_msix_entry(kvm_context_t kvm,
                              struct kvm_assigned_msix_entry *entry)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_ENTRY, entry);
}
#endif

unsigned long kvm_get_thread_id(void)
{
    return syscall(SYS_gettid);
}

static void kvm_cond_wait(pthread_cond_t *cond)
{
    CPUState *env = cpu_single_env;

    pthread_cond_wait(cond, &qemu_mutex);
    cpu_single_env = env;
}
static void sig_ipi_handler(int n)
{
}

static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        sigprocmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!");
    abort();
}

static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code, (void *)(intptr_t)siginfo->ssi_addr))
        sigbus_reraise();
}
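/* Run func(data) on the vcpu thread that owns env: call it directly if we
 * already are that thread, otherwise queue a work item, kick the target with
 * SIG_IPI and wait on qemu_work_cond until the item is marked done. */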
void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
{
    struct qemu_work_item wi;

    if (env == current_env) {
        func(data);
        return;
    }

    wi.func = func;
    wi.data = data;
    if (!env->kvm_cpu_state.queued_work_first) {
        env->kvm_cpu_state.queued_work_first = &wi;
    } else {
        env->kvm_cpu_state.queued_work_last->next = &wi;
    }
    env->kvm_cpu_state.queued_work_last = &wi;
    wi.next = NULL;
    wi.done = false;

    pthread_kill(env->thread->thread, SIG_IPI);
    while (!wi.done) {
        kvm_cond_wait(&qemu_work_cond);
    }
}
static void inject_interrupt(void *data)
{
    cpu_interrupt(current_env, (long) data);
}

void kvm_inject_interrupt(CPUState *env, int mask)
{
    on_vcpu(env, inject_interrupt, (void *) (long) mask);
}

void kvm_update_interrupt_request(CPUState *env)
{
    int signal = 0;

    if (env) {
        if (!current_env || !current_env->created) {
            signal = 1;
        }
        /*
         * Testing for created here is really redundant
         */
        if (current_env && current_env->created &&
            env != current_env && !env->kvm_cpu_state.signalled) {
            signal = 1;
        }

        if (signal) {
            env->kvm_cpu_state.signalled = 1;
            if (env->thread) {
                pthread_kill(env->thread->thread, SIG_IPI);
            }
        }
    }
}
int kvm_cpu_exec(CPUState *env)
{
    int r;

    r = kvm_run(env);
    if (r < 0) {
        printf("kvm_run returned %d\n", r);
        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(VMSTOP_PANIC);
    }

    return 0;
}

int kvm_cpu_is_stopped(CPUState *env)
{
    return !vm_running || env->stopped;
}

static void flush_queued_work(CPUState *env)
{
    struct qemu_work_item *wi;

    if (!env->kvm_cpu_state.queued_work_first) {
        return;
    }

    while ((wi = env->kvm_cpu_state.queued_work_first)) {
        env->kvm_cpu_state.queued_work_first = wi->next;
        wi->func(wi->data);
        wi->done = true;
    }
    env->kvm_cpu_state.queued_work_last = NULL;
    pthread_cond_broadcast(&qemu_work_cond);
}
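/* Wait for SIG_IPI/SIGBUS with qemu_mutex temporarily dropped, then drain any
 * queued work items and honour a pending stop request before the caller goes
 * back to running the guest. */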
static void kvm_main_loop_wait(CPUState *env, int timeout)
{
    struct timespec ts;
    int r, e;
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;

    ts.tv_sec = timeout / 1000;
    ts.tv_nsec = (timeout % 1000) * 1000000;
    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        pthread_mutex_unlock(&qemu_mutex);

        r = sigtimedwait(&waitset, &siginfo, &ts);
        e = errno;

        pthread_mutex_lock(&qemu_mutex);

        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
            printf("sigtimedwait: %s\n", strerror(e));
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr))
                sigbus_reraise();
            break;
        default:
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            printf("sigpending: %s\n", strerror(errno));
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));

    cpu_single_env = env;
    flush_queued_work(env);

    if (env->stop) {
        env->stop = 0;
        env->stopped = 1;
        pthread_cond_signal(&qemu_pause_cond);
    }

    env->kvm_cpu_state.signalled = 0;
}
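/* VM-wide pause/resume: every other vcpu is flagged to stop and kicked with
 * SIG_IPI; pause_all_threads() then sleeps on qemu_pause_cond until each vcpu
 * has acknowledged the stop in kvm_main_loop_wait(). */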
static int all_threads_paused(void)
{
    CPUState *penv = first_cpu;

    while (penv) {
        if (penv->stop) {
            return 0;
        }
        penv = (CPUState *) penv->next_cpu;
    }

    return 1;
}

static void pause_all_threads(void)
{
    CPUState *penv = first_cpu;

    while (penv) {
        if (penv != cpu_single_env) {
            penv->stop = 1;
            pthread_kill(penv->thread->thread, SIG_IPI);
        } else {
            penv->stop = 0;
            penv->stopped = 1;
            cpu_exit(penv);
        }
        penv = (CPUState *) penv->next_cpu;
    }

    while (!all_threads_paused()) {
        kvm_cond_wait(&qemu_pause_cond);
    }
}

static void resume_all_threads(void)
{
    CPUState *penv = first_cpu;

    assert(!cpu_single_env);

    while (penv) {
        penv->stop = 0;
        penv->stopped = 0;
        pthread_kill(penv->thread->thread, SIG_IPI);
        penv = (CPUState *) penv->next_cpu;
    }
}

static void kvm_vm_state_change_handler(void *context, int running, int reason)
{
    if (running) {
        resume_all_threads();
    } else {
        pause_all_threads();
    }
}
static void setup_kernel_sigmask(CPUState *env)
{
    sigset_t set;

    sigemptyset(&set);
    sigaddset(&set, SIGUSR2);
    sigaddset(&set, SIGIO);
    sigaddset(&set, SIGALRM);
    sigprocmask(SIG_BLOCK, &set, NULL);

    sigprocmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);

    kvm_set_signal_mask(env, &set);
}

static void qemu_kvm_system_reset(void)
{
    pause_all_threads();

    cpu_synchronize_all_states();
    qemu_system_reset();

    resume_all_threads();
}

static void process_irqchip_events(CPUState *env)
{
    kvm_arch_process_irqchip_events(env);
    if (kvm_arch_has_work(env))
        env->halted = 0;
}
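/* Per-vcpu thread body: run the guest whenever the vcpu is runnable,
 * otherwise sleep in kvm_main_loop_wait() with a one second timeout. */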
static int kvm_main_loop_cpu(CPUState *env)
{
    while (1) {
        int run_cpu = !kvm_cpu_is_stopped(env);
        if (run_cpu) {
            kvm_arch_process_async_events(env);
            if (!kvm_irqchip_in_kernel()) {
                process_irqchip_events(env);
                run_cpu = !env->halted;
            }
        }
        if (run_cpu) {
            kvm_cpu_exec(env);
            kvm_main_loop_wait(env, 0);
        } else {
            kvm_main_loop_wait(env, 1000);
        }
    }
    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}
static void *ap_main_loop(void *_env)
{
    CPUState *env = _env;
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
    struct ioperm_data *data = NULL;
#endif

    current_env = env;
    env->thread_id = kvm_get_thread_id();

#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
    /* do ioperm for io ports of assigned devices */
    QLIST_FOREACH(data, &ioperm_head, entries)
        on_vcpu(env, kvm_arch_do_ioperm, data);
#endif

    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;

    if (kvm_create_vcpu(env) < 0) {
        abort();
    }
    setup_kernel_sigmask(env);

    /* signal VCPU creation */
    current_env->created = 1;
    pthread_cond_signal(&qemu_vcpu_cond);

    /* and wait for machine initialization */
    while (!qemu_system_ready) {
        kvm_cond_wait(&qemu_system_cond);
    }

    /* re-initialize cpu_single_env after re-acquiring qemu_mutex */
    cpu_single_env = env;

    kvm_main_loop_cpu(env);
    return NULL;
}

int kvm_init_vcpu(CPUState *env)
{
    env->thread = qemu_mallocz(sizeof(QemuThread));
    qemu_thread_create(env->thread, ap_main_loop, env);

    while (env->created == 0) {
        kvm_cond_wait(&qemu_vcpu_cond);
    }

    return 0;
}
#ifdef TARGET_I386
void kvm_hpet_disable_kpit(void)
{
    struct kvm_pit_state2 ps2;

    kvm_get_pit2(kvm_context, &ps2);
    ps2.flags |= KVM_PIT_FLAGS_HPET_LEGACY;
    kvm_set_pit2(kvm_context, &ps2);
}

void kvm_hpet_enable_kpit(void)
{
    struct kvm_pit_state2 ps2;

    kvm_get_pit2(kvm_context, &ps2);
    ps2.flags &= ~KVM_PIT_FLAGS_HPET_LEGACY;
    kvm_set_pit2(kvm_context, &ps2);
}
#endif

int kvm_init_ap(void)
{
    struct sigaction action;

    qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);

    signal(SIG_IPI, sig_ipi_handler);

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);
    prctl(PR_MCE_KILL, 1, 1, 0, 0);
    return 0;
}
/* If we have signalfd, we mask out the signals we want to handle and then
 * use signalfd to listen for them.  We rely on whatever the current signal
 * handler is to dispatch the signals when we receive them.
 */
static void sigfd_handler(void *opaque)
{
    int fd = (unsigned long) opaque;
    struct qemu_signalfd_siginfo info;
    struct sigaction action;
    ssize_t len;

    while (1) {
        do {
            len = read(fd, &info, sizeof(info));
        } while (len == -1 && errno == EINTR);

        if (len == -1 && errno == EAGAIN) {
            break;
        }

        if (len != sizeof(info)) {
            printf("read from sigfd returned %zd: %m\n", len);
            return;
        }

        sigaction(info.ssi_signo, NULL, &action);
        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
            action.sa_sigaction(info.ssi_signo,
                                (siginfo_t *)&info, NULL);
        } else if (action.sa_handler) {
            action.sa_handler(info.ssi_signo);
        }
    }
}
int kvm_main_loop(void)
{
    sigset_t mask;
    int sigfd;

    qemu_system_ready = 1;

    sigemptyset(&mask);
    sigaddset(&mask, SIGIO);
    sigaddset(&mask, SIGALRM);
    sigaddset(&mask, SIGBUS);
    sigprocmask(SIG_BLOCK, &mask, NULL);

    sigfd = qemu_signalfd(&mask);
    if (sigfd == -1) {
        fprintf(stderr, "failed to create signalfd\n");
        return -errno;
    }

    fcntl(sigfd, F_SETFL, O_NONBLOCK);

    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
                         (void *)(unsigned long) sigfd);

    pthread_cond_broadcast(&qemu_system_cond);

    cpu_single_env = NULL;

    while (1) {
        main_loop_wait(0);
        if (qemu_shutdown_requested()) {
            monitor_protocol_event(QEVENT_SHUTDOWN, NULL);
            if (qemu_no_shutdown()) {
                vm_stop(VMSTOP_SHUTDOWN);
            } else {
                break;
            }
        } else if (qemu_powerdown_requested()) {
            monitor_protocol_event(QEVENT_POWERDOWN, NULL);
            qemu_irq_raise(qemu_system_powerdown);
        } else if (qemu_reset_requested()) {
            qemu_kvm_system_reset();
        } else if (kvm_debug_cpu_requested) {
            gdb_set_stop_cpu(kvm_debug_cpu_requested);
            vm_stop(VMSTOP_DEBUG);
            kvm_debug_cpu_requested = NULL;
        }
    }

    bdrv_close_all();
    pause_all_threads();
    pthread_mutex_unlock(&qemu_mutex);

    return 0;
}
#if !defined(TARGET_I386)
int kvm_arch_init_irq_routing(void)
{
    return 0;
}
#endif

extern int no_hpet;
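/* kvm_create_context(): create the VM file descriptor, run the arch-specific
 * setup, create the in-kernel irqchip, probe optional capabilities (coalesced
 * MMIO, vcpu events, debug registers, XSAVE/XCRS), and start the vcpu thread
 * infrastructure via kvm_init_ap(). Called from kvm_init() with qemu_mutex
 * held. */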
static int kvm_create_context(void)
{
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";

    int r;

    kvm_state->pit_in_kernel = kvm_pit;

#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context->irq_routes = qemu_mallocz(sizeof(*kvm_context->irq_routes));
    kvm_context->nr_allocated_irq_routes = 0;
#endif

    kvm_state->vmfd = kvm_ioctl(kvm_state, KVM_CREATE_VM, 0);
    if (kvm_state->vmfd < 0) {
        fprintf(stderr, "kvm_create_vm: %m\n");
        kvm_finalize(kvm_state);
        return -1;
    }

    r = kvm_arch_init(kvm_state);
    if (r < 0) {
        kvm_finalize(kvm_state);
        return r;
    }

    kvm_create_irqchip(kvm_context);

    /* There was a nasty bug in < kvm-80 that prevents memory slots from being
     * destroyed properly.  Since we rely on this capability, refuse to work
     * with any kernel without this capability. */
    if (!kvm_check_extension(kvm_state, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
        fprintf(stderr,
                "KVM kernel module broken (DESTROY_MEMORY_REGION).\n%s",
                upgrade_note);
        return -EINVAL;
    }

    r = kvm_arch_init_irq_routing();
    if (r < 0) {
        return r;
    }

    kvm_state->coalesced_mmio = 0;
#ifdef KVM_CAP_COALESCED_MMIO
    kvm_state->coalesced_mmio =
        kvm_check_extension(kvm_state, KVM_CAP_COALESCED_MMIO);
#endif

    kvm_state->vcpu_events = 0;
#ifdef KVM_CAP_VCPU_EVENTS
    kvm_state->vcpu_events = kvm_check_extension(kvm_state, KVM_CAP_VCPU_EVENTS);
#endif

    kvm_state->debugregs = 0;
#ifdef KVM_CAP_DEBUGREGS
    kvm_state->debugregs = kvm_check_extension(kvm_state, KVM_CAP_DEBUGREGS);
#endif

    kvm_state->xsave = 0;
#ifdef KVM_CAP_XSAVE
    kvm_state->xsave = kvm_check_extension(kvm_state, KVM_CAP_XSAVE);
#endif

    kvm_state->xcrs = 0;
#ifdef KVM_CAP_XCRS
    kvm_state->xcrs = kvm_check_extension(kvm_state, KVM_CAP_XCRS);
#endif

    kvm_state->many_ioeventfds = kvm_check_many_ioeventfds();

    kvm_init_ap();
    if (kvm_irqchip) {
        if (!qemu_kvm_has_gsi_routing()) {
            irq0override = 0;
#ifdef TARGET_I386
            /* if kernel can't do irq routing, interrupt source
             * override 0->2 cannot be set up as required by HPET,
             * so disable hpet.
             */
            no_hpet = 1;
        } else if (!qemu_kvm_has_pit_state2()) {
            no_hpet = 1;
        }
#else
        }
#endif
    }

    return 0;
}
#ifdef KVM_CAP_IRQCHIP

int kvm_set_irq(int irq, int level, int *status)
{
    return kvm_set_irq_level(kvm_context, irq, level, status);
}

#endif

static void kvm_mutex_unlock(void)
{
    assert(!cpu_single_env);
    pthread_mutex_unlock(&qemu_mutex);
}

static void kvm_mutex_lock(void)
{
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = NULL;
}

void qemu_mutex_unlock_iothread(void)
{
    if (kvm_enabled()) {
        kvm_mutex_unlock();
    }
}

void qemu_mutex_lock_iothread(void)
{
    if (kvm_enabled()) {
        kvm_mutex_lock();
    }
}

#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
void kvm_add_ioperm_data(struct ioperm_data *data)
{
    QLIST_INSERT_HEAD(&ioperm_head, data, entries);
}

void kvm_remove_ioperm_data(unsigned long start_port, unsigned long num)
{
    struct ioperm_data *data;

    data = QLIST_FIRST(&ioperm_head);
    while (data) {
        struct ioperm_data *next = QLIST_NEXT(data, entries);

        if (data->start_port == start_port && data->num == num) {
            QLIST_REMOVE(data, entries);
            qemu_free(data);
        }

        data = next;
    }
}

void kvm_ioperm(CPUState *env, void *data)
{
    if (kvm_enabled() && qemu_system_ready) {
        on_vcpu(env, kvm_arch_do_ioperm, data);
    }
}

#endif

int kvm_set_boot_cpu_id(uint32_t id)
{
    return kvm_set_boot_vcpu_id(kvm_context, id);
}