qemu-kvm: Merge kvm_context into KVMState
[qemu-kvm.git] / qemu-kvm.c
blobdf8e81710279ff334eca54c17a008df45850b6ed
1 /*
2 * qemu/kvm integration
4 * Copyright (C) 2006-2008 Qumranet Technologies
6 * Licensed under the terms of the GNU GPL version 2 or higher.
7 */
8 #include "config.h"
9 #include "config-host.h"
11 #include <assert.h>
12 #include <string.h>
13 #include "hw/hw.h"
14 #include "sysemu.h"
15 #include "qemu-common.h"
16 #include "console.h"
17 #include "block.h"
18 #include "compatfd.h"
19 #include "gdbstub.h"
20 #include "monitor.h"
21 #include "cpus.h"
22 #include "qemu-thread.h"
24 #include "qemu-kvm.h"
26 #include <pthread.h>
27 #include <sys/utsname.h>
28 #include <sys/syscall.h>
29 #include <sys/mman.h>
30 #include <sys/ioctl.h>
31 #include "compatfd.h"
32 #include <sys/prctl.h>
34 #ifndef PR_MCE_KILL
35 #define PR_MCE_KILL 33
36 #endif
38 #ifndef BUS_MCEERR_AR
39 #define BUS_MCEERR_AR 4
40 #endif
41 #ifndef BUS_MCEERR_AO
42 #define BUS_MCEERR_AO 5
43 #endif
45 #define EXPECTED_KVM_API_VERSION 12
47 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
48 #error libkvm: userspace and kernel version mismatch
49 #endif
51 int kvm_irqchip = 1;
52 int kvm_pit = 1;
53 int kvm_pit_reinject = 1;
54 int kvm_nested = 0;
56 pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
57 pthread_cond_t qemu_vcpu_cond = PTHREAD_COND_INITIALIZER;
58 pthread_cond_t qemu_system_cond = PTHREAD_COND_INITIALIZER;
59 pthread_cond_t qemu_pause_cond = PTHREAD_COND_INITIALIZER;
60 pthread_cond_t qemu_work_cond = PTHREAD_COND_INITIALIZER;
61 __thread CPUState *current_env;
63 static int qemu_system_ready;
65 #define SIG_IPI (SIGRTMIN+4)
67 CPUState *kvm_debug_cpu_requested;
69 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
70 /* The list of ioperm_data */
71 static QLIST_HEAD(, ioperm_data) ioperm_head;
72 #endif
74 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
/* Report an unrecognized KVM exit reason and fail the run loop. */
static int handle_unhandled(uint64_t reason)
{
    fprintf(stderr, "kvm: unhandled exit %" PRIx64 "\n", reason);
    return -EINVAL;
}
#define VMX_INVALID_GUEST_STATE 0x80000021

/*
 * Diagnose a failed VM entry.  Exit reason 0x80000021 is VMX "invalid
 * guest state"; print extra guidance in that case.  Always returns
 * -EINVAL so the caller aborts the run loop.
 */
static int handle_failed_vmentry(uint64_t reason)
{
    fprintf(stderr, "kvm: vm entry failed with error 0x%" PRIx64 "\n\n", reason);

    /* Perhaps we will need to check if this machine is intel since exit reason 0x21
       has a different interpretation on SVM */
    if (reason == VMX_INVALID_GUEST_STATE) {
        /* Typos fixed: "runnning" -> "running", "maybe running" -> "may be running" */
        fprintf(stderr, "If you're running a guest on an Intel machine without\n");
        fprintf(stderr, "unrestricted mode support, the failure can be most likely\n");
        fprintf(stderr, "due to the guest entering an invalid state for Intel VT.\n");
        fprintf(stderr, "For example, the guest may be running in big real mode\n");
        fprintf(stderr, "which is not supported on less recent Intel processors.\n\n");
    }
    return -EINVAL;
}
101 static inline void set_gsi(KVMState *s, unsigned int gsi)
103 uint32_t *bitmap = s->used_gsi_bitmap;
105 if (gsi < s->max_gsi) {
106 bitmap[gsi / 32] |= 1U << (gsi % 32);
107 } else {
108 DPRINTF("Invalid GSI %u\n", gsi);
112 static inline void clear_gsi(KVMState *s, unsigned int gsi)
114 uint32_t *bitmap = s->used_gsi_bitmap;
116 if (gsi < s->max_gsi) {
117 bitmap[gsi / 32] &= ~(1U << (gsi % 32));
118 } else {
119 DPRINTF("Invalid GSI %u\n", gsi);
123 static int kvm_create_context(void);
125 int kvm_init(void)
127 int fd;
128 int r, i;
131 fd = open("/dev/kvm", O_RDWR);
132 if (fd == -1) {
133 perror("open /dev/kvm");
134 return -1;
136 r = ioctl(fd, KVM_GET_API_VERSION, 0);
137 if (r == -1) {
138 fprintf(stderr,
139 "kvm kernel version too old: "
140 "KVM_GET_API_VERSION ioctl not supported\n");
141 goto out_close;
143 if (r < EXPECTED_KVM_API_VERSION) {
144 fprintf(stderr, "kvm kernel version too old: "
145 "We expect API version %d or newer, but got "
146 "version %d\n", EXPECTED_KVM_API_VERSION, r);
147 goto out_close;
149 if (r > EXPECTED_KVM_API_VERSION) {
150 fprintf(stderr, "kvm userspace version too old\n");
151 goto out_close;
153 kvm_state = qemu_mallocz(sizeof(*kvm_state));
155 kvm_state->fd = fd;
156 kvm_state->vmfd = -1;
158 #ifdef KVM_CAP_SET_GUEST_DEBUG
159 QTAILQ_INIT(&kvm_state->kvm_sw_breakpoints);
160 #endif
162 for (i = 0; i < ARRAY_SIZE(kvm_state->slots); i++) {
163 kvm_state->slots[i].slot = i;
166 #ifdef KVM_CAP_USER_MEMORY
167 r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY);
168 if (r <= 0) {
169 fprintf(stderr,
170 "Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported\n");
171 goto out_close;
173 #else
174 #error Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported
175 #endif
177 cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);
179 pthread_mutex_lock(&qemu_mutex);
180 return kvm_create_context();
182 out_close:
183 close(fd);
184 return -1;
187 static void kvm_finalize(KVMState *s)
189 /* FIXME
190 if (kvm->vcpu_fd[0] != -1)
191 close(kvm->vcpu_fd[0]);
192 if (kvm->vm_fd != -1)
193 close(kvm->vm_fd);
195 close(s->fd);
196 free(s);
199 static int kvm_init_irq_routing(KVMState *s)
201 #ifdef KVM_CAP_IRQ_ROUTING
202 int r, gsi_count;
204 gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING);
205 if (gsi_count > 0) {
206 int gsi_bits, i;
208 /* Round up so we can search ints using ffs */
209 gsi_bits = ALIGN(gsi_count, 32);
210 s->used_gsi_bitmap = qemu_mallocz(gsi_bits / 8);
211 s->max_gsi = gsi_bits;
213 /* Mark any over-allocated bits as already in use */
214 for (i = gsi_count; i < gsi_bits; i++) {
215 set_gsi(s, i);
219 s->irq_routes = qemu_mallocz(sizeof(*s->irq_routes));
220 s->nr_allocated_irq_routes = 0;
222 r = kvm_arch_init_irq_routing();
223 if (r < 0) {
224 return r;
226 #endif
228 return 0;
231 int kvm_create_irqchip(KVMState *s)
233 #ifdef KVM_CAP_IRQCHIP
234 int r;
236 if (!kvm_irqchip || !kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
237 return 0;
240 r = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
241 if (r < 0) {
242 fprintf(stderr, "Create kernel PIC irqchip failed\n");
243 return r;
246 s->irqchip_inject_ioctl = KVM_IRQ_LINE;
247 #if defined(KVM_CAP_IRQ_INJECT_STATUS) && defined(KVM_IRQ_LINE_STATUS)
248 if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
249 s->irqchip_inject_ioctl = KVM_IRQ_LINE_STATUS;
251 #endif
252 s->irqchip_in_kernel = 1;
254 r = kvm_init_irq_routing(s);
255 if (r < 0) {
256 return r;
258 #endif
260 return 0;
263 #ifdef KVM_CAP_IRQCHIP
265 int kvm_set_irq(int irq, int level, int *status)
267 struct kvm_irq_level event;
268 int r;
270 if (!kvm_state->irqchip_in_kernel) {
271 return 0;
273 event.level = level;
274 event.irq = irq;
275 r = kvm_vm_ioctl(kvm_state, kvm_state->irqchip_inject_ioctl,
276 &event);
277 if (r < 0) {
278 perror("kvm_set_irq");
281 if (status) {
282 #ifdef KVM_CAP_IRQ_INJECT_STATUS
283 *status = (kvm_state->irqchip_inject_ioctl == KVM_IRQ_LINE) ?
284 1 : event.status;
285 #else
286 *status = 1;
287 #endif
290 return 1;
293 int kvm_get_irqchip(KVMState *s, struct kvm_irqchip *chip)
295 int r;
297 if (!s->irqchip_in_kernel) {
298 return 0;
300 r = kvm_vm_ioctl(s, KVM_GET_IRQCHIP, chip);
301 if (r < 0) {
302 perror("kvm_get_irqchip\n");
304 return r;
307 int kvm_set_irqchip(KVMState *s, struct kvm_irqchip *chip)
309 int r;
311 if (!s->irqchip_in_kernel) {
312 return 0;
314 r = kvm_vm_ioctl(s, KVM_SET_IRQCHIP, chip);
315 if (r < 0) {
316 perror("kvm_set_irqchip\n");
318 return r;
321 #endif
323 static int handle_mmio(CPUState *env)
325 unsigned long addr = env->kvm_run->mmio.phys_addr;
326 struct kvm_run *kvm_run = env->kvm_run;
327 void *data = kvm_run->mmio.data;
329 /* hack: Red Hat 7.1 generates these weird accesses. */
330 if ((addr > 0xa0000 - 4 && addr <= 0xa0000) && kvm_run->mmio.len == 3) {
331 return 0;
334 cpu_physical_memory_rw(addr, data, kvm_run->mmio.len, kvm_run->mmio.is_write);
335 return 0;
338 static int handle_shutdown(CPUState *env)
340 /* stop the current vcpu from going back to guest mode */
341 env->stopped = 1;
343 qemu_system_reset_request();
344 return 1;
347 static inline void push_nmi(void)
349 #ifdef KVM_CAP_USER_NMI
350 kvm_arch_push_nmi();
351 #endif /* KVM_CAP_USER_NMI */
354 static void post_kvm_run(CPUState *env)
356 pthread_mutex_lock(&qemu_mutex);
357 kvm_arch_post_run(env, env->kvm_run);
358 cpu_single_env = env;
361 static int pre_kvm_run(CPUState *env)
363 kvm_arch_pre_run(env, env->kvm_run);
365 pthread_mutex_unlock(&qemu_mutex);
366 return 0;
369 int kvm_is_ready_for_interrupt_injection(CPUState *env)
371 return env->kvm_run->ready_for_interrupt_injection;
374 static int kvm_run(CPUState *env)
376 int r;
377 struct kvm_run *run = env->kvm_run;
378 int fd = env->kvm_fd;
380 again:
381 if (env->kvm_vcpu_dirty) {
382 kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
383 env->kvm_vcpu_dirty = 0;
385 push_nmi();
386 #if !defined(__s390__)
387 if (!kvm_state->irqchip_in_kernel) {
388 run->request_interrupt_window = kvm_arch_try_push_interrupts(env);
390 #endif
392 r = pre_kvm_run(env);
393 if (r) {
394 return r;
396 if (env->exit_request) {
397 env->exit_request = 0;
398 pthread_kill(env->thread->thread, SIG_IPI);
400 r = ioctl(fd, KVM_RUN, 0);
402 if (r == -1 && errno != EINTR && errno != EAGAIN) {
403 r = -errno;
404 post_kvm_run(env);
405 fprintf(stderr, "kvm_run: %s\n", strerror(-r));
406 return r;
409 post_kvm_run(env);
411 kvm_flush_coalesced_mmio_buffer();
413 #if !defined(__s390__)
414 if (r == -1) {
415 return 1;
417 #endif
418 if (1) {
419 switch (run->exit_reason) {
420 case KVM_EXIT_UNKNOWN:
421 r = handle_unhandled(run->hw.hardware_exit_reason);
422 break;
423 case KVM_EXIT_FAIL_ENTRY:
424 r = handle_failed_vmentry(run->fail_entry.hardware_entry_failure_reason);
425 break;
426 case KVM_EXIT_EXCEPTION:
427 fprintf(stderr, "exception %d (%x)\n", run->ex.exception,
428 run->ex.error_code);
429 cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
430 abort();
431 break;
432 case KVM_EXIT_IO:
433 kvm_handle_io(run->io.port,
434 (uint8_t *)run + run->io.data_offset,
435 run->io.direction,
436 run->io.size,
437 run->io.count);
438 r = 0;
439 break;
440 case KVM_EXIT_MMIO:
441 r = handle_mmio(env);
442 break;
443 case KVM_EXIT_HLT:
444 r = kvm_arch_halt(env);
445 break;
446 case KVM_EXIT_IRQ_WINDOW_OPEN:
447 break;
448 case KVM_EXIT_SHUTDOWN:
449 r = handle_shutdown(env);
450 break;
451 #if defined(__s390__)
452 case KVM_EXIT_S390_SIEIC:
453 r = kvm_s390_handle_intercept(kvm, env, run);
454 break;
455 case KVM_EXIT_S390_RESET:
456 r = kvm_s390_handle_reset(kvm, env, run);
457 break;
458 #endif
459 case KVM_EXIT_INTERNAL_ERROR:
460 r = kvm_handle_internal_error(env, run);
461 break;
462 default:
463 r = kvm_arch_run(env);
464 if (r < 0) {
465 fprintf(stderr, "unhandled vm exit: 0x%x\n", run->exit_reason);
466 cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
467 abort();
469 if (r > 0) {
470 return r;
472 break;
475 if (!r) {
476 goto again;
478 return r;
481 int kvm_inject_irq(CPUState *env, unsigned irq)
483 struct kvm_interrupt intr;
485 intr.irq = irq;
486 return kvm_vcpu_ioctl(env, KVM_INTERRUPT, &intr);
489 int kvm_inject_nmi(CPUState *env)
491 #ifdef KVM_CAP_USER_NMI
492 return kvm_vcpu_ioctl(env, KVM_NMI);
493 #else
494 return -ENOSYS;
495 #endif
#ifdef KVM_CAP_DEVICE_ASSIGNMENT
/* Attach a host PCI device to the VM. */
int kvm_assign_pci_device(KVMState *s,
                          struct kvm_assigned_pci_dev *assigned_dev)
{
    return kvm_vm_ioctl(s, KVM_ASSIGN_PCI_DEVICE, assigned_dev);
}

/* Legacy IRQ-assignment ioctl for kernels without ASSIGN_DEV_IRQ. */
static int kvm_old_assign_irq(KVMState *s,
                              struct kvm_assigned_irq *assigned_irq)
{
    return kvm_vm_ioctl(s, KVM_ASSIGN_IRQ, assigned_irq);
}

#ifdef KVM_CAP_ASSIGN_DEV_IRQ
/* Route an assigned device's IRQ, falling back to the legacy ioctl. */
int kvm_assign_irq(KVMState *s, struct kvm_assigned_irq *assigned_irq)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_ASSIGN_DEV_IRQ);
    if (ret > 0) {
        return kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ, assigned_irq);
    }

    return kvm_old_assign_irq(s, assigned_irq);
}

/* Detach an assigned device's IRQ. */
int kvm_deassign_irq(KVMState *s, struct kvm_assigned_irq *assigned_irq)
{
    return kvm_vm_ioctl(s, KVM_DEASSIGN_DEV_IRQ, assigned_irq);
}
#else
/* Without ASSIGN_DEV_IRQ only the legacy path exists. */
int kvm_assign_irq(KVMState *s, struct kvm_assigned_irq *assigned_irq)
{
    return kvm_old_assign_irq(s, assigned_irq);
}
#endif
#endif

#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
/* Detach a host PCI device from the VM. */
int kvm_deassign_pci_device(KVMState *s,
                            struct kvm_assigned_pci_dev *assigned_dev)
{
    return kvm_vm_ioctl(s, KVM_DEASSIGN_PCI_DEVICE, assigned_dev);
}
#endif
544 int kvm_reinject_control(KVMState *s, int pit_reinject)
546 #ifdef KVM_CAP_REINJECT_CONTROL
547 int r;
548 struct kvm_reinject_control control;
550 control.pit_reinject = pit_reinject;
552 r = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_REINJECT_CONTROL);
553 if (r > 0) {
554 return kvm_vm_ioctl(s, KVM_REINJECT_CONTROL, &control);
556 #endif
557 return -ENOSYS;
560 int kvm_has_gsi_routing(void)
562 int r = 0;
564 #ifdef KVM_CAP_IRQ_ROUTING
565 r = kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
566 #endif
567 return r;
570 int kvm_clear_gsi_routes(void)
572 #ifdef KVM_CAP_IRQ_ROUTING
573 kvm_state->irq_routes->nr = 0;
574 return 0;
575 #else
576 return -EINVAL;
577 #endif
/*
 * Append @entry to the cached routing table, growing the table
 * geometrically as needed, and mark its GSI as in use.
 */
int kvm_add_routing_entry(struct kvm_irq_routing_entry *entry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    KVMState *s = kvm_state;
    struct kvm_irq_routing *z;
    struct kvm_irq_routing_entry *new;
    int n, size;

    if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
        /* Double the capacity, starting from 64 entries. */
        n = s->nr_allocated_irq_routes * 2;
        if (n < 64) {
            n = 64;
        }
        size = sizeof(struct kvm_irq_routing);
        size += n * sizeof(*new);
        z = realloc(s->irq_routes, size);
        if (!z) {
            return -ENOMEM;
        }
        s->nr_allocated_irq_routes = n;
        s->irq_routes = z;
    }
    n = s->irq_routes->nr++;
    new = &s->irq_routes->entries[n];
    memset(new, 0, sizeof(*new));
    new->gsi = entry->gsi;
    new->type = entry->type;
    new->flags = entry->flags;
    new->u = entry->u;

    set_gsi(s, entry->gsi);

    return 0;
#else
    return -ENOSYS;
#endif
}

/*
 * Route @gsi to irqchip @irqchip/@pin.
 */
int kvm_add_irq_route(int gsi, int irqchip, int pin)
{
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing_entry e;

    /* Fix: zero the whole entry — unused union/padding bytes are
     * copied wholesale into the table handed to the kernel, so they
     * must not be stack garbage. */
    memset(&e, 0, sizeof(e));
    e.gsi = gsi;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    return kvm_add_routing_entry(&e);
#else
    return -ENOSYS;
#endif
}
/*
 * Remove the table entry matching @entry (same type, GSI, and payload).
 * When the removed entry was the GSI's last user, release the GSI in
 * the bitmap.  Returns 0 on success, -ESRCH when no entry matches.
 */
int kvm_del_routing_entry(struct kvm_irq_routing_entry *entry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    KVMState *s = kvm_state;
    struct kvm_irq_routing_entry *e, *p;
    int i, gsi, found = 0;

    gsi = entry->gsi;

    for (i = 0; i < s->irq_routes->nr; ++i) {
        e = &s->irq_routes->entries[i];
        if (e->type == entry->type && e->gsi == gsi) {
            switch (e->type) {
            case KVM_IRQ_ROUTING_IRQCHIP:
                if (e->u.irqchip.irqchip == entry->u.irqchip.irqchip
                    && e->u.irqchip.pin == entry->u.irqchip.pin) {
                    /* Replace the match with the last entry. */
                    p = &s->irq_routes->entries[--s->irq_routes->nr];
                    *e = *p;
                    found = 1;
                }
                break;
            case KVM_IRQ_ROUTING_MSI:
                if (e->u.msi.address_lo == entry->u.msi.address_lo
                    && e->u.msi.address_hi == entry->u.msi.address_hi
                    && e->u.msi.data == entry->u.msi.data) {
                    p = &s->irq_routes->entries[--s->irq_routes->nr];
                    *e = *p;
                    found = 1;
                }
                break;
            default:
                break;
            }
            if (found) {
                /* If there are no other users of this GSI
                 * mark it available in the bitmap */
                for (i = 0; i < s->irq_routes->nr; i++) {
                    e = &s->irq_routes->entries[i];
                    if (e->gsi == gsi)
                        break;
                }
                if (i == s->irq_routes->nr) {
                    clear_gsi(s, gsi);
                }
                return 0;
            }
        }
    }
    return -ESRCH;
#else
    return -ENOSYS;
#endif
}

/*
 * Overwrite the payload of the entry matching @entry with @newentry's.
 * GSI and type must agree between old and new.  Returns 0 on success,
 * -EINVAL on mismatched key, -ESRCH when no entry matches.
 */
int kvm_update_routing_entry(struct kvm_irq_routing_entry *entry,
                             struct kvm_irq_routing_entry *newentry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    KVMState *s = kvm_state;
    struct kvm_irq_routing_entry *e;
    int i;

    if (entry->gsi != newentry->gsi || entry->type != newentry->type) {
        return -EINVAL;
    }

    for (i = 0; i < s->irq_routes->nr; ++i) {
        e = &s->irq_routes->entries[i];
        if (e->type != entry->type || e->gsi != entry->gsi) {
            continue;
        }
        switch (e->type) {
        case KVM_IRQ_ROUTING_IRQCHIP:
            if (e->u.irqchip.irqchip == entry->u.irqchip.irqchip &&
                e->u.irqchip.pin == entry->u.irqchip.pin) {
                memcpy(&e->u.irqchip, &newentry->u.irqchip,
                       sizeof e->u.irqchip);
                return 0;
            }
            break;
        case KVM_IRQ_ROUTING_MSI:
            if (e->u.msi.address_lo == entry->u.msi.address_lo &&
                e->u.msi.address_hi == entry->u.msi.address_hi &&
                e->u.msi.data == entry->u.msi.data) {
                memcpy(&e->u.msi, &newentry->u.msi, sizeof e->u.msi);
                return 0;
            }
            break;
        default:
            break;
        }
    }

    return -ESRCH;
#else
    return -ENOSYS;
#endif
}
738 int kvm_del_irq_route(int gsi, int irqchip, int pin)
740 #ifdef KVM_CAP_IRQ_ROUTING
741 struct kvm_irq_routing_entry e;
743 e.gsi = gsi;
744 e.type = KVM_IRQ_ROUTING_IRQCHIP;
745 e.flags = 0;
746 e.u.irqchip.irqchip = irqchip;
747 e.u.irqchip.pin = pin;
748 return kvm_del_routing_entry(&e);
749 #else
750 return -ENOSYS;
751 #endif
754 int kvm_commit_irq_routes(void)
756 #ifdef KVM_CAP_IRQ_ROUTING
757 KVMState *s = kvm_state;
759 s->irq_routes->flags = 0;
760 return kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
761 #else
762 return -ENOSYS;
763 #endif
766 int kvm_get_irq_route_gsi(void)
768 KVMState *s = kvm_state;
769 int i, bit;
770 uint32_t *buf = s->used_gsi_bitmap;
772 /* Return the lowest unused GSI in the bitmap */
773 for (i = 0; i < s->max_gsi / 32; i++) {
774 bit = ffs(~buf[i]);
775 if (!bit) {
776 continue;
779 return bit - 1 + i * 32;
782 return -ENOSPC;
785 static void kvm_msi_routing_entry(struct kvm_irq_routing_entry *e,
786 KVMMsiMessage *msg)
789 e->gsi = msg->gsi;
790 e->type = KVM_IRQ_ROUTING_MSI;
791 e->flags = 0;
792 e->u.msi.address_lo = msg->addr_lo;
793 e->u.msi.address_hi = msg->addr_hi;
794 e->u.msi.data = msg->data;
797 int kvm_msi_message_add(KVMMsiMessage *msg)
799 struct kvm_irq_routing_entry e;
800 int ret;
802 ret = kvm_get_irq_route_gsi();
803 if (ret < 0) {
804 return ret;
806 msg->gsi = ret;
808 kvm_msi_routing_entry(&e, msg);
809 return kvm_add_routing_entry(&e);
812 int kvm_msi_message_del(KVMMsiMessage *msg)
814 struct kvm_irq_routing_entry e;
816 kvm_msi_routing_entry(&e, msg);
817 return kvm_del_routing_entry(&e);
820 int kvm_msi_message_update(KVMMsiMessage *old, KVMMsiMessage *new)
822 struct kvm_irq_routing_entry e1, e2;
823 int ret;
825 new->gsi = old->gsi;
826 if (memcmp(old, new, sizeof(KVMMsiMessage)) == 0) {
827 return 0;
830 kvm_msi_routing_entry(&e1, old);
831 kvm_msi_routing_entry(&e2, new);
833 ret = kvm_update_routing_entry(&e1, &e2);
834 if (ret < 0) {
835 return ret;
838 return 1;
842 #ifdef KVM_CAP_DEVICE_MSIX
843 int kvm_assign_set_msix_nr(KVMState *s, struct kvm_assigned_msix_nr *msix_nr)
845 return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_NR, msix_nr);
848 int kvm_assign_set_msix_entry(KVMState *s,
849 struct kvm_assigned_msix_entry *entry)
851 return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_ENTRY, entry);
853 #endif
855 unsigned long kvm_get_thread_id(void)
857 return syscall(SYS_gettid);
860 static void kvm_cond_wait(pthread_cond_t *cond)
862 CPUState *env = cpu_single_env;
864 pthread_cond_wait(cond, &qemu_mutex);
865 cpu_single_env = env;
868 static void sig_ipi_handler(int n)
872 static void sigbus_reraise(void)
874 sigset_t set;
875 struct sigaction action;
877 memset(&action, 0, sizeof(action));
878 action.sa_handler = SIG_DFL;
879 if (!sigaction(SIGBUS, &action, NULL)) {
880 raise(SIGBUS);
881 sigemptyset(&set);
882 sigaddset(&set, SIGBUS);
883 sigprocmask(SIG_UNBLOCK, &set, NULL);
885 perror("Failed to re-raise SIGBUS!\n");
886 abort();
889 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
890 void *ctx)
892 if (kvm_on_sigbus(siginfo->ssi_code, (void *)(intptr_t)siginfo->ssi_addr))
893 sigbus_reraise();
896 void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
898 struct qemu_work_item wi;
900 if (env == current_env) {
901 func(data);
902 return;
905 wi.func = func;
906 wi.data = data;
907 if (!env->kvm_cpu_state.queued_work_first) {
908 env->kvm_cpu_state.queued_work_first = &wi;
909 } else {
910 env->kvm_cpu_state.queued_work_last->next = &wi;
912 env->kvm_cpu_state.queued_work_last = &wi;
913 wi.next = NULL;
914 wi.done = false;
916 pthread_kill(env->thread->thread, SIG_IPI);
917 while (!wi.done) {
918 kvm_cond_wait(&qemu_work_cond);
922 void kvm_update_interrupt_request(CPUState *env)
924 int signal = 0;
926 if (env) {
927 if (!current_env || !current_env->created) {
928 signal = 1;
931 * Testing for created here is really redundant
933 if (current_env && current_env->created &&
934 env != current_env && !env->kvm_cpu_state.signalled) {
935 signal = 1;
938 if (signal) {
939 env->kvm_cpu_state.signalled = 1;
940 if (env->thread) {
941 pthread_kill(env->thread->thread, SIG_IPI);
947 int kvm_cpu_exec(CPUState *env)
949 int r;
951 r = kvm_run(env);
952 if (r < 0) {
953 printf("kvm_run returned %d\n", r);
954 cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
955 vm_stop(VMSTOP_PANIC);
958 return 0;
961 static int kvm_cpu_is_stopped(CPUState *env)
963 return !vm_running || env->stopped;
966 static void flush_queued_work(CPUState *env)
968 struct qemu_work_item *wi;
970 if (!env->kvm_cpu_state.queued_work_first) {
971 return;
974 while ((wi = env->kvm_cpu_state.queued_work_first)) {
975 env->kvm_cpu_state.queued_work_first = wi->next;
976 wi->func(wi->data);
977 wi->done = true;
979 env->kvm_cpu_state.queued_work_last = NULL;
980 pthread_cond_broadcast(&qemu_work_cond);
983 static void kvm_main_loop_wait(CPUState *env, int timeout)
985 struct timespec ts;
986 int r, e;
987 siginfo_t siginfo;
988 sigset_t waitset;
989 sigset_t chkset;
991 ts.tv_sec = timeout / 1000;
992 ts.tv_nsec = (timeout % 1000) * 1000000;
993 sigemptyset(&waitset);
994 sigaddset(&waitset, SIG_IPI);
995 sigaddset(&waitset, SIGBUS);
997 do {
998 pthread_mutex_unlock(&qemu_mutex);
1000 r = sigtimedwait(&waitset, &siginfo, &ts);
1001 e = errno;
1003 pthread_mutex_lock(&qemu_mutex);
1005 if (r == -1 && !(e == EAGAIN || e == EINTR)) {
1006 printf("sigtimedwait: %s\n", strerror(e));
1007 exit(1);
1010 switch (r) {
1011 case SIGBUS:
1012 if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr))
1013 sigbus_reraise();
1014 break;
1015 default:
1016 break;
1019 r = sigpending(&chkset);
1020 if (r == -1) {
1021 printf("sigpending: %s\n", strerror(e));
1022 exit(1);
1024 } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
1026 cpu_single_env = env;
1027 flush_queued_work(env);
1029 if (env->stop) {
1030 env->stop = 0;
1031 env->stopped = 1;
1032 pthread_cond_signal(&qemu_pause_cond);
1035 env->kvm_cpu_state.signalled = 0;
1038 static int all_threads_paused(void)
1040 CPUState *penv = first_cpu;
1042 while (penv) {
1043 if (penv->stop) {
1044 return 0;
1046 penv = (CPUState *) penv->next_cpu;
1049 return 1;
1052 static void pause_all_threads(void)
1054 CPUState *penv = first_cpu;
1056 while (penv) {
1057 if (penv != cpu_single_env) {
1058 penv->stop = 1;
1059 pthread_kill(penv->thread->thread, SIG_IPI);
1060 } else {
1061 penv->stop = 0;
1062 penv->stopped = 1;
1063 cpu_exit(penv);
1065 penv = (CPUState *) penv->next_cpu;
1068 while (!all_threads_paused()) {
1069 kvm_cond_wait(&qemu_pause_cond);
1073 static void resume_all_threads(void)
1075 CPUState *penv = first_cpu;
1077 assert(!cpu_single_env);
1079 while (penv) {
1080 penv->stop = 0;
1081 penv->stopped = 0;
1082 pthread_kill(penv->thread->thread, SIG_IPI);
1083 penv = (CPUState *) penv->next_cpu;
1087 static void kvm_vm_state_change_handler(void *context, int running, int reason)
1089 if (running) {
1090 resume_all_threads();
1091 } else {
1092 pause_all_threads();
1096 static void setup_kernel_sigmask(CPUState *env)
1098 sigset_t set;
1100 sigemptyset(&set);
1101 sigaddset(&set, SIGUSR2);
1102 sigaddset(&set, SIGIO);
1103 sigaddset(&set, SIGALRM);
1104 sigprocmask(SIG_BLOCK, &set, NULL);
1106 sigprocmask(SIG_BLOCK, NULL, &set);
1107 sigdelset(&set, SIG_IPI);
1108 sigdelset(&set, SIGBUS);
1110 kvm_set_signal_mask(env, &set);
1113 static void qemu_kvm_system_reset(void)
1115 pause_all_threads();
1117 cpu_synchronize_all_states();
1118 qemu_system_reset();
1120 resume_all_threads();
1123 static void process_irqchip_events(CPUState *env)
1125 kvm_arch_process_irqchip_events(env);
1126 if (kvm_arch_has_work(env))
1127 env->halted = 0;
1130 static int kvm_main_loop_cpu(CPUState *env)
1132 while (1) {
1133 int run_cpu = !kvm_cpu_is_stopped(env);
1134 if (run_cpu) {
1135 kvm_arch_process_async_events(env);
1136 if (!kvm_irqchip_in_kernel()) {
1137 process_irqchip_events(env);
1138 run_cpu = !env->halted;
1141 if (run_cpu) {
1142 kvm_cpu_exec(env);
1143 kvm_main_loop_wait(env, 0);
1144 } else {
1145 kvm_main_loop_wait(env, 1000);
1148 pthread_mutex_unlock(&qemu_mutex);
1149 return 0;
1152 static void *ap_main_loop(void *_env)
1154 CPUState *env = _env;
1155 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1156 struct ioperm_data *data = NULL;
1157 #endif
1159 current_env = env;
1160 env->thread_id = kvm_get_thread_id();
1162 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1163 /* do ioperm for io ports of assigned devices */
1164 QLIST_FOREACH(data, &ioperm_head, entries)
1165 on_vcpu(env, kvm_arch_do_ioperm, data);
1166 #endif
1168 pthread_mutex_lock(&qemu_mutex);
1169 cpu_single_env = env;
1171 if (kvm_create_vcpu(env) < 0) {
1172 abort();
1174 setup_kernel_sigmask(env);
1176 /* signal VCPU creation */
1177 current_env->created = 1;
1178 pthread_cond_signal(&qemu_vcpu_cond);
1180 /* and wait for machine initialization */
1181 while (!qemu_system_ready) {
1182 kvm_cond_wait(&qemu_system_cond);
1185 /* re-initialize cpu_single_env after re-acquiring qemu_mutex */
1186 cpu_single_env = env;
1188 kvm_main_loop_cpu(env);
1189 return NULL;
1192 int kvm_init_vcpu(CPUState *env)
1194 env->thread = qemu_mallocz(sizeof(QemuThread));
1195 qemu_thread_create(env->thread, ap_main_loop, env);
1197 while (env->created == 0) {
1198 kvm_cond_wait(&qemu_vcpu_cond);
1201 return 0;
1204 #ifdef TARGET_I386
1205 void kvm_hpet_disable_kpit(void)
1207 struct kvm_pit_state2 ps2;
1209 kvm_get_pit2(kvm_state, &ps2);
1210 ps2.flags |= KVM_PIT_FLAGS_HPET_LEGACY;
1211 kvm_set_pit2(kvm_state, &ps2);
1214 void kvm_hpet_enable_kpit(void)
1216 struct kvm_pit_state2 ps2;
1218 kvm_get_pit2(kvm_state, &ps2);
1219 ps2.flags &= ~KVM_PIT_FLAGS_HPET_LEGACY;
1220 kvm_set_pit2(kvm_state, &ps2);
1222 #endif
1224 int kvm_init_ap(void)
1226 struct sigaction action;
1228 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
1230 signal(SIG_IPI, sig_ipi_handler);
1232 memset(&action, 0, sizeof(action));
1233 action.sa_flags = SA_SIGINFO;
1234 action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
1235 sigaction(SIGBUS, &action, NULL);
1236 prctl(PR_MCE_KILL, 1, 1, 0, 0);
1237 return 0;
1240 /* If we have signalfd, we mask out the signals we want to handle and then
1241 * use signalfd to listen for them. We rely on whatever the current signal
1242 * handler is to dispatch the signals when we receive them.
1245 static void sigfd_handler(void *opaque)
1247 int fd = (unsigned long) opaque;
1248 struct qemu_signalfd_siginfo info;
1249 struct sigaction action;
1250 ssize_t len;
1252 while (1) {
1253 do {
1254 len = read(fd, &info, sizeof(info));
1255 } while (len == -1 && errno == EINTR);
1257 if (len == -1 && errno == EAGAIN) {
1258 break;
1261 if (len != sizeof(info)) {
1262 printf("read from sigfd returned %zd: %m\n", len);
1263 return;
1266 sigaction(info.ssi_signo, NULL, &action);
1267 if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
1268 action.sa_sigaction(info.ssi_signo,
1269 (siginfo_t *)&info, NULL);
1270 } else if (action.sa_handler) {
1271 action.sa_handler(info.ssi_signo);
1276 int kvm_main_loop(void)
1278 sigset_t mask;
1279 int sigfd;
1281 qemu_system_ready = 1;
1283 sigemptyset(&mask);
1284 sigaddset(&mask, SIGIO);
1285 sigaddset(&mask, SIGALRM);
1286 sigaddset(&mask, SIGBUS);
1287 sigprocmask(SIG_BLOCK, &mask, NULL);
1289 sigfd = qemu_signalfd(&mask);
1290 if (sigfd == -1) {
1291 fprintf(stderr, "failed to create signalfd\n");
1292 return -errno;
1295 fcntl(sigfd, F_SETFL, O_NONBLOCK);
1297 qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
1298 (void *)(unsigned long) sigfd);
1300 pthread_cond_broadcast(&qemu_system_cond);
1302 cpu_single_env = NULL;
1304 while (1) {
1305 main_loop_wait(0);
1306 if (qemu_shutdown_requested()) {
1307 monitor_protocol_event(QEVENT_SHUTDOWN, NULL);
1308 if (qemu_no_shutdown()) {
1309 vm_stop(VMSTOP_SHUTDOWN);
1310 } else {
1311 break;
1313 } else if (qemu_powerdown_requested()) {
1314 monitor_protocol_event(QEVENT_POWERDOWN, NULL);
1315 qemu_irq_raise(qemu_system_powerdown);
1316 } else if (qemu_reset_requested()) {
1317 qemu_kvm_system_reset();
1318 } else if (kvm_debug_cpu_requested) {
1319 gdb_set_stop_cpu(kvm_debug_cpu_requested);
1320 vm_stop(VMSTOP_DEBUG);
1321 kvm_debug_cpu_requested = NULL;
1325 bdrv_close_all();
1326 pause_all_threads();
1327 pthread_mutex_unlock(&qemu_mutex);
1329 return 0;
1332 #if !defined(TARGET_I386)
1333 int kvm_arch_init_irq_routing(void)
1335 return 0;
1337 #endif
1339 static int kvm_create_context(void)
1341 static const char upgrade_note[] =
1342 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
1343 "(see http://sourceforge.net/projects/kvm).\n";
1345 int r;
1347 kvm_state->pit_in_kernel = kvm_pit;
1349 kvm_state->vmfd = kvm_ioctl(kvm_state, KVM_CREATE_VM, 0);
1350 if (kvm_state->vmfd < 0) {
1351 fprintf(stderr, "kvm_create_vm: %m\n");
1352 kvm_finalize(kvm_state);
1353 return -1;
1356 r = kvm_arch_init(kvm_state);
1357 if (r < 0) {
1358 kvm_finalize(kvm_state);
1359 return r;
1362 /* There was a nasty bug in < kvm-80 that prevents memory slots from being
1363 * destroyed properly. Since we rely on this capability, refuse to work
1364 * with any kernel without this capability. */
1365 if (!kvm_check_extension(kvm_state, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
1366 fprintf(stderr,
1367 "KVM kernel module broken (DESTROY_MEMORY_REGION).\n%s",
1368 upgrade_note);
1369 return -EINVAL;
1372 r = kvm_create_irqchip(kvm_state);
1373 if (r < 0) {
1374 return r;
1377 kvm_state->coalesced_mmio = 0;
1378 #ifdef KVM_CAP_COALESCED_MMIO
1379 kvm_state->coalesced_mmio =
1380 kvm_check_extension(kvm_state, KVM_CAP_COALESCED_MMIO);
1381 #endif
1383 kvm_state->vcpu_events = 0;
1384 #ifdef KVM_CAP_VCPU_EVENTS
1385 kvm_state->vcpu_events = kvm_check_extension(kvm_state, KVM_CAP_VCPU_EVENTS);
1386 #endif
1388 kvm_state->debugregs = 0;
1389 #ifdef KVM_CAP_DEBUGREGS
1390 kvm_state->debugregs = kvm_check_extension(kvm_state, KVM_CAP_DEBUGREGS);
1391 #endif
1393 kvm_state->xsave = 0;
1394 #ifdef KVM_CAP_XSAVE
1395 kvm_state->xsave = kvm_check_extension(kvm_state, KVM_CAP_XSAVE);
1396 #endif
1398 kvm_state->xcrs = 0;
1399 #ifdef KVM_CAP_XCRS
1400 kvm_state->xcrs = kvm_check_extension(kvm_state, KVM_CAP_XCRS);
1401 #endif
1403 kvm_state->many_ioeventfds = kvm_check_many_ioeventfds();
1405 kvm_state->pit_state2 = 0;
1406 #ifdef KVM_CAP_PIT_STATE2
1407 kvm_state->pit_state2 = kvm_check_extension(kvm_state, KVM_CAP_PIT_STATE2);
1408 #endif
1410 kvm_init_ap();
1412 return 0;
1415 static void kvm_mutex_unlock(void)
1417 assert(!cpu_single_env);
1418 pthread_mutex_unlock(&qemu_mutex);
1421 static void kvm_mutex_lock(void)
1423 pthread_mutex_lock(&qemu_mutex);
1424 cpu_single_env = NULL;
1427 void qemu_mutex_unlock_iothread(void)
1429 if (kvm_enabled()) {
1430 kvm_mutex_unlock();
1434 void qemu_mutex_lock_iothread(void)
1436 if (kvm_enabled()) {
1437 kvm_mutex_lock();
1441 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1442 void kvm_add_ioperm_data(struct ioperm_data *data)
1444 QLIST_INSERT_HEAD(&ioperm_head, data, entries);
1447 void kvm_remove_ioperm_data(unsigned long start_port, unsigned long num)
1449 struct ioperm_data *data;
1451 data = QLIST_FIRST(&ioperm_head);
1452 while (data) {
1453 struct ioperm_data *next = QLIST_NEXT(data, entries);
1455 if (data->start_port == start_port && data->num == num) {
1456 QLIST_REMOVE(data, entries);
1457 qemu_free(data);
1460 data = next;
1464 void kvm_ioperm(CPUState *env, void *data)
1466 if (kvm_enabled() && qemu_system_ready) {
1467 on_vcpu(env, kvm_arch_do_ioperm, data);
1471 #endif
1473 int kvm_set_boot_cpu_id(KVMState *s, uint32_t id)
1475 #ifdef KVM_CAP_SET_BOOT_CPU_ID
1476 int r = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_SET_BOOT_CPU_ID);
1477 if (r > 0) {
1478 return kvm_vm_ioctl(s, KVM_SET_BOOT_CPU_ID, id);
1480 #endif
1481 return -ENOSYS;