qemu-kvm: Drop kvm_inject_interrupt
[qemu-kvm.git] / qemu-kvm.c (blob 1f9ddeac8bb887ca62bda1dd7457fad2ca41664c)

/*
 * qemu/kvm integration
 *
 * Copyright (C) 2006-2008 Qumranet Technologies
 *
 * Licensed under the terms of the GNU GPL version 2 or higher.
 */
#include "config.h"
#include "config-host.h"

#include <assert.h>
#include <string.h>
#include "hw/hw.h"
#include "sysemu.h"
#include "qemu-common.h"
#include "console.h"
#include "block.h"
#include "compatfd.h"
#include "gdbstub.h"
#include "monitor.h"
#include "cpus.h"
#include "qemu-thread.h"

#include "qemu-kvm.h"

#include <pthread.h>
#include <sys/utsname.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include "compatfd.h"
#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef BUS_MCEERR_AR
#define BUS_MCEERR_AR 4
#endif
#ifndef BUS_MCEERR_AO
#define BUS_MCEERR_AO 5
#endif

#define EXPECTED_KVM_API_VERSION 12

#if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
#error libkvm: userspace and kernel version mismatch
#endif

int kvm_irqchip = 1;
int kvm_pit = 1;
int kvm_pit_reinject = 1;
int kvm_nested = 0;

KVMState *kvm_state;
kvm_context_t kvm_context;

pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t qemu_vcpu_cond = PTHREAD_COND_INITIALIZER;
pthread_cond_t qemu_system_cond = PTHREAD_COND_INITIALIZER;
pthread_cond_t qemu_pause_cond = PTHREAD_COND_INITIALIZER;
pthread_cond_t qemu_work_cond = PTHREAD_COND_INITIALIZER;
__thread CPUState *current_env;

static int qemu_system_ready;

#define SIG_IPI (SIGRTMIN+4)

CPUState *kvm_debug_cpu_requested;

#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
/* The list of ioperm_data */
static QLIST_HEAD(, ioperm_data) ioperm_head;
#endif

#define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))

static int handle_unhandled(uint64_t reason)
{
    fprintf(stderr, "kvm: unhandled exit %" PRIx64 "\n", reason);
    return -EINVAL;
}

#define VMX_INVALID_GUEST_STATE 0x80000021

static int handle_failed_vmentry(uint64_t reason)
{
    fprintf(stderr, "kvm: vm entry failed with error 0x%" PRIx64 "\n\n", reason);

    /* Perhaps we will need to check if this machine is intel since exit reason 0x21
       has a different interpretation on SVM */
    if (reason == VMX_INVALID_GUEST_STATE) {
        fprintf(stderr, "If you're running a guest on an Intel machine without\n");
        fprintf(stderr, "unrestricted mode support, the failure can be most likely\n");
        fprintf(stderr, "due to the guest entering an invalid state for Intel VT.\n");
        fprintf(stderr, "For example, the guest may be running in big real mode\n");
        fprintf(stderr, "which is not supported on less recent Intel processors.\n\n");
    }

    return -EINVAL;
}

static inline void set_gsi(kvm_context_t kvm, unsigned int gsi)
{
    uint32_t *bitmap = kvm->used_gsi_bitmap;

    if (gsi < kvm->max_gsi)
        bitmap[gsi / 32] |= 1U << (gsi % 32);
    else
        DPRINTF("Invalid GSI %u\n", gsi);
}

static inline void clear_gsi(kvm_context_t kvm, unsigned int gsi)
{
    uint32_t *bitmap = kvm->used_gsi_bitmap;

    if (gsi < kvm->max_gsi)
        bitmap[gsi / 32] &= ~(1U << (gsi % 32));
    else
        DPRINTF("Invalid GSI %u\n", gsi);
}

static int kvm_create_context(void);

int kvm_init(void)
{
    int fd;
    int r, gsi_count, i;

    fd = open("/dev/kvm", O_RDWR);
    if (fd == -1) {
        perror("open /dev/kvm");
        return -1;
    }
    r = ioctl(fd, KVM_GET_API_VERSION, 0);
    if (r == -1) {
        fprintf(stderr,
                "kvm kernel version too old: "
                "KVM_GET_API_VERSION ioctl not supported\n");
        goto out_close;
    }
    if (r < EXPECTED_KVM_API_VERSION) {
        fprintf(stderr, "kvm kernel version too old: "
                "We expect API version %d or newer, but got "
                "version %d\n", EXPECTED_KVM_API_VERSION, r);
        goto out_close;
    }
    if (r > EXPECTED_KVM_API_VERSION) {
        fprintf(stderr, "kvm userspace version too old\n");
        goto out_close;
    }
    kvm_state = qemu_mallocz(sizeof(*kvm_state));
    kvm_context = &kvm_state->kvm_context;

    kvm_state->fd = fd;
    kvm_state->vmfd = -1;

#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_INIT(&kvm_state->kvm_sw_breakpoints);
#endif

    for (i = 0; i < ARRAY_SIZE(kvm_state->slots); i++) {
        kvm_state->slots[i].slot = i;
    }

#ifdef KVM_CAP_USER_MEMORY
    r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY);
    if (r <= 0) {
        fprintf(stderr,
                "Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported\n");
        goto out_close;
    }
#else
#error Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported
#endif

    gsi_count = kvm_get_gsi_count(kvm_context);
    if (gsi_count > 0) {
        int gsi_bits, i;

        /* Round up so we can search ints using ffs */
        gsi_bits = ALIGN(gsi_count, 32);
        kvm_context->used_gsi_bitmap = qemu_mallocz(gsi_bits / 8);
        kvm_context->max_gsi = gsi_bits;

        /* Mark any over-allocated bits as already in use */
        for (i = gsi_count; i < gsi_bits; i++) {
            set_gsi(kvm_context, i);
        }
    }

    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);

    pthread_mutex_lock(&qemu_mutex);
    return kvm_create_context();

out_close:
    close(fd);
    return -1;
}

static void kvm_finalize(KVMState *s)
{
    /* FIXME
       if (kvm->vcpu_fd[0] != -1)
           close(kvm->vcpu_fd[0]);
       if (kvm->vm_fd != -1)
           close(kvm->vm_fd);
     */
    close(s->fd);
    free(s);
}

static int kvm_set_boot_vcpu_id(kvm_context_t kvm, uint32_t id)
{
#ifdef KVM_CAP_SET_BOOT_CPU_ID
    int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_SET_BOOT_CPU_ID);
    if (r > 0) {
        return kvm_vm_ioctl(kvm_state, KVM_SET_BOOT_CPU_ID, id);
    }
    return -ENOSYS;
#else
    return -ENOSYS;
#endif
}

void kvm_create_irqchip(kvm_context_t kvm)
{
    int r;

#ifdef KVM_CAP_IRQCHIP
    if (kvm_irqchip) {
        r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP);
        if (r > 0) {            /* kernel irqchip supported */
            r = kvm_vm_ioctl(kvm_state, KVM_CREATE_IRQCHIP);
            if (r >= 0) {
                kvm->irqchip_inject_ioctl = KVM_IRQ_LINE;
#if defined(KVM_CAP_IRQ_INJECT_STATUS) && defined(KVM_IRQ_LINE_STATUS)
                r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION,
                              KVM_CAP_IRQ_INJECT_STATUS);
                if (r > 0) {
                    kvm->irqchip_inject_ioctl = KVM_IRQ_LINE_STATUS;
                }
#endif
                kvm_state->irqchip_in_kernel = 1;
            } else
                fprintf(stderr, "Create kernel PIC irqchip failed\n");
        }
    }
#endif
}

#ifdef KVM_CAP_IRQCHIP

int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int *status)
{
    struct kvm_irq_level event;
    int r;

    if (!kvm_state->irqchip_in_kernel) {
        return 0;
    }
    event.level = level;
    event.irq = irq;
    r = kvm_vm_ioctl(kvm_state, kvm->irqchip_inject_ioctl, &event);
    if (r < 0) {
        perror("kvm_set_irq_level");
    }

    if (status) {
#ifdef KVM_CAP_IRQ_INJECT_STATUS
        *status =
            (kvm->irqchip_inject_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
#else
        *status = 1;
#endif
    }

    return 1;
}

int kvm_get_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
{
    int r;

    if (!kvm_state->irqchip_in_kernel) {
        return 0;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip);
    if (r < 0) {
        perror("kvm_get_irqchip\n");
    }
    return r;
}

int kvm_set_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
{
    int r;

    if (!kvm_state->irqchip_in_kernel) {
        return 0;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip);
    if (r < 0) {
        perror("kvm_set_irqchip\n");
    }
    return r;
}

#endif

static int handle_mmio(CPUState *env)
{
    unsigned long addr = env->kvm_run->mmio.phys_addr;
    struct kvm_run *kvm_run = env->kvm_run;
    void *data = kvm_run->mmio.data;

    /* hack: Red Hat 7.1 generates these weird accesses. */
    if ((addr > 0xa0000 - 4 && addr <= 0xa0000) && kvm_run->mmio.len == 3) {
        return 0;
    }

    cpu_physical_memory_rw(addr, data, kvm_run->mmio.len, kvm_run->mmio.is_write);
    return 0;
}

int handle_io_window(kvm_context_t kvm)
{
    return 1;
}

int handle_shutdown(kvm_context_t kvm, CPUState *env)
{
    /* stop the current vcpu from going back to guest mode */
    env->stopped = 1;

    qemu_system_reset_request();
    return 1;
}

static inline void push_nmi(kvm_context_t kvm)
{
#ifdef KVM_CAP_USER_NMI
    kvm_arch_push_nmi();
#endif                          /* KVM_CAP_USER_NMI */
}

void post_kvm_run(kvm_context_t kvm, CPUState *env)
{
    pthread_mutex_lock(&qemu_mutex);
    kvm_arch_post_run(env, env->kvm_run);
    cpu_single_env = env;
}

int pre_kvm_run(kvm_context_t kvm, CPUState *env)
{
    kvm_arch_pre_run(env, env->kvm_run);

    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}

int kvm_is_ready_for_interrupt_injection(CPUState *env)
{
    return env->kvm_run->ready_for_interrupt_injection;
}

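/* Outer run loop for one vcpu: flush dirty register state to the kernel,
 * enter the guest via KVM_RUN (with qemu_mutex dropped around the ioctl)
 * and dispatch the resulting exit reason to the handlers above.  Loops
 * until an exit has to be reported back to the caller (r != 0).
 */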
int kvm_run(CPUState *env)
{
    int r;
    kvm_context_t kvm = &env->kvm_state->kvm_context;
    struct kvm_run *run = env->kvm_run;
    int fd = env->kvm_fd;

again:
    if (env->kvm_vcpu_dirty) {
        kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
        env->kvm_vcpu_dirty = 0;
    }
    push_nmi(kvm);
#if !defined(__s390__)
    if (!kvm_state->irqchip_in_kernel) {
        run->request_interrupt_window = kvm_arch_try_push_interrupts(env);
    }
#endif

    r = pre_kvm_run(kvm, env);
    if (r) {
        return r;
    }
    if (env->exit_request) {
        env->exit_request = 0;
        pthread_kill(env->thread->thread, SIG_IPI);
    }
    r = ioctl(fd, KVM_RUN, 0);

    if (r == -1 && errno != EINTR && errno != EAGAIN) {
        r = -errno;
        post_kvm_run(kvm, env);
        fprintf(stderr, "kvm_run: %s\n", strerror(-r));
        return r;
    }

    post_kvm_run(kvm, env);

    kvm_flush_coalesced_mmio_buffer();

#if !defined(__s390__)
    if (r == -1) {
        r = handle_io_window(kvm);
        goto more;
    }
#endif
    if (1) {
        switch (run->exit_reason) {
        case KVM_EXIT_UNKNOWN:
            r = handle_unhandled(run->hw.hardware_exit_reason);
            break;
        case KVM_EXIT_FAIL_ENTRY:
            r = handle_failed_vmentry(run->fail_entry.hardware_entry_failure_reason);
            break;
        case KVM_EXIT_EXCEPTION:
            fprintf(stderr, "exception %d (%x)\n", run->ex.exception,
                    run->ex.error_code);
            cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
            abort();
            break;
        case KVM_EXIT_IO:
            kvm_handle_io(run->io.port,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            r = 0;
            break;
        case KVM_EXIT_MMIO:
            r = handle_mmio(env);
            break;
        case KVM_EXIT_HLT:
            r = kvm_arch_halt(env);
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            break;
        case KVM_EXIT_SHUTDOWN:
            r = handle_shutdown(kvm, env);
            break;
#if defined(__s390__)
        case KVM_EXIT_S390_SIEIC:
            r = kvm_s390_handle_intercept(kvm, env, run);
            break;
        case KVM_EXIT_S390_RESET:
            r = kvm_s390_handle_reset(kvm, env, run);
            break;
#endif
        case KVM_EXIT_INTERNAL_ERROR:
            r = kvm_handle_internal_error(env, run);
            break;
        default:
            r = kvm_arch_run(env);
            if (r < 0) {
                fprintf(stderr, "unhandled vm exit: 0x%x\n", run->exit_reason);
                cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
                abort();
            }
            if (r > 0) {
                return r;
            }
            break;
        }
    }
more:
    if (!r) {
        goto again;
    }
    return r;
}

int kvm_inject_irq(CPUState *env, unsigned irq)
{
    struct kvm_interrupt intr;

    intr.irq = irq;
    return kvm_vcpu_ioctl(env, KVM_INTERRUPT, &intr);
}

int kvm_inject_nmi(CPUState *env)
{
#ifdef KVM_CAP_USER_NMI
    return kvm_vcpu_ioctl(env, KVM_NMI);
#else
    return -ENOSYS;
#endif
}

#ifdef KVM_CAP_DEVICE_ASSIGNMENT
int kvm_assign_pci_device(kvm_context_t kvm,
                          struct kvm_assigned_pci_dev *assigned_dev)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_PCI_DEVICE, assigned_dev);
}

static int kvm_old_assign_irq(kvm_context_t kvm,
                              struct kvm_assigned_irq *assigned_irq)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_IRQ, assigned_irq);
}

#ifdef KVM_CAP_ASSIGN_DEV_IRQ
int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
{
    int ret;

    ret = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_ASSIGN_DEV_IRQ);
    if (ret > 0) {
        return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_DEV_IRQ, assigned_irq);
    }

    return kvm_old_assign_irq(kvm, assigned_irq);
}

int kvm_deassign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
{
    return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_DEV_IRQ, assigned_irq);
}
#else
int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
{
    return kvm_old_assign_irq(kvm, assigned_irq);
}
#endif
#endif

#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
int kvm_deassign_pci_device(kvm_context_t kvm,
                            struct kvm_assigned_pci_dev *assigned_dev)
{
    return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_PCI_DEVICE, assigned_dev);
}
#endif

int kvm_reinject_control(kvm_context_t kvm, int pit_reinject)
{
#ifdef KVM_CAP_REINJECT_CONTROL
    int r;
    struct kvm_reinject_control control;

    control.pit_reinject = pit_reinject;

    r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_REINJECT_CONTROL);
    if (r > 0) {
        return kvm_vm_ioctl(kvm_state, KVM_REINJECT_CONTROL, &control);
    }
#endif
    return -ENOSYS;
}

int kvm_has_gsi_routing(void)
{
    int r = 0;

#ifdef KVM_CAP_IRQ_ROUTING
    r = kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#endif
    return r;
}

int kvm_get_gsi_count(kvm_context_t kvm)
{
#ifdef KVM_CAP_IRQ_ROUTING
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#else
    return -EINVAL;
#endif
}

int kvm_clear_gsi_routes(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;

    kvm->irq_routes->nr = 0;
    return 0;
#else
    return -EINVAL;
#endif
}

int kvm_add_routing_entry(struct kvm_irq_routing_entry *entry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;
    struct kvm_irq_routing *z;
    struct kvm_irq_routing_entry *new;
    int n, size;

    if (kvm->irq_routes->nr == kvm->nr_allocated_irq_routes) {
        n = kvm->nr_allocated_irq_routes * 2;
        if (n < 64) {
            n = 64;
        }
        size = sizeof(struct kvm_irq_routing);
        size += n * sizeof(*new);
        z = realloc(kvm->irq_routes, size);
        if (!z) {
            return -ENOMEM;
        }
        kvm->nr_allocated_irq_routes = n;
        kvm->irq_routes = z;
    }
    n = kvm->irq_routes->nr++;
    new = &kvm->irq_routes->entries[n];
    memset(new, 0, sizeof(*new));
    new->gsi = entry->gsi;
    new->type = entry->type;
    new->flags = entry->flags;
    new->u = entry->u;

    set_gsi(kvm, entry->gsi);

    return 0;
#else
    return -ENOSYS;
#endif
}

int kvm_add_irq_route(int gsi, int irqchip, int pin)
{
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing_entry e;

    e.gsi = gsi;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    return kvm_add_routing_entry(&e);
#else
    return -ENOSYS;
#endif
}

int kvm_del_routing_entry(struct kvm_irq_routing_entry *entry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;
    struct kvm_irq_routing_entry *e, *p;
    int i, gsi, found = 0;

    gsi = entry->gsi;

    for (i = 0; i < kvm->irq_routes->nr; ++i) {
        e = &kvm->irq_routes->entries[i];
        if (e->type == entry->type && e->gsi == gsi) {
            switch (e->type) {
            case KVM_IRQ_ROUTING_IRQCHIP:{
                    if (e->u.irqchip.irqchip ==
                        entry->u.irqchip.irqchip
                        && e->u.irqchip.pin == entry->u.irqchip.pin) {
                        p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
                        *e = *p;
                        found = 1;
                    }
                    break;
                }
            case KVM_IRQ_ROUTING_MSI:{
                    if (e->u.msi.address_lo ==
                        entry->u.msi.address_lo
                        && e->u.msi.address_hi ==
                        entry->u.msi.address_hi
                        && e->u.msi.data == entry->u.msi.data) {
                        p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
                        *e = *p;
                        found = 1;
                    }
                    break;
                }
            default:
                break;
            }
            if (found) {
                /* If there are no other users of this GSI
                 * mark it available in the bitmap */
                for (i = 0; i < kvm->irq_routes->nr; i++) {
                    e = &kvm->irq_routes->entries[i];
                    if (e->gsi == gsi)
                        break;
                }
                if (i == kvm->irq_routes->nr) {
                    clear_gsi(kvm, gsi);
                }

                return 0;
            }
        }
    }

    return -ESRCH;
#else
    return -ENOSYS;
#endif
}

int kvm_update_routing_entry(struct kvm_irq_routing_entry *entry,
                             struct kvm_irq_routing_entry *newentry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;
    struct kvm_irq_routing_entry *e;
    int i;

    if (entry->gsi != newentry->gsi || entry->type != newentry->type) {
        return -EINVAL;
    }

    for (i = 0; i < kvm->irq_routes->nr; ++i) {
        e = &kvm->irq_routes->entries[i];
        if (e->type != entry->type || e->gsi != entry->gsi) {
            continue;
        }
        switch (e->type) {
        case KVM_IRQ_ROUTING_IRQCHIP:
            if (e->u.irqchip.irqchip == entry->u.irqchip.irqchip &&
                e->u.irqchip.pin == entry->u.irqchip.pin) {
                memcpy(&e->u.irqchip, &newentry->u.irqchip,
                       sizeof e->u.irqchip);
                return 0;
            }
            break;
        case KVM_IRQ_ROUTING_MSI:
            if (e->u.msi.address_lo == entry->u.msi.address_lo &&
                e->u.msi.address_hi == entry->u.msi.address_hi &&
                e->u.msi.data == entry->u.msi.data) {
                memcpy(&e->u.msi, &newentry->u.msi, sizeof e->u.msi);
                return 0;
            }
            break;
        default:
            break;
        }
    }

    return -ESRCH;
#else
    return -ENOSYS;
#endif
}

int kvm_del_irq_route(int gsi, int irqchip, int pin)
{
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing_entry e;

    e.gsi = gsi;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    return kvm_del_routing_entry(&e);
#else
    return -ENOSYS;
#endif
}

int kvm_commit_irq_routes(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;

    kvm->irq_routes->flags = 0;
    return kvm_vm_ioctl(kvm_state, KVM_SET_GSI_ROUTING, kvm->irq_routes);
#else
    return -ENOSYS;
#endif
}

int kvm_get_irq_route_gsi(void)
{
    kvm_context_t kvm = kvm_context;
    int i, bit;
    uint32_t *buf = kvm->used_gsi_bitmap;

    /* Return the lowest unused GSI in the bitmap */
    for (i = 0; i < kvm->max_gsi / 32; i++) {
        bit = ffs(~buf[i]);
        if (!bit) {
            continue;
        }

        return bit - 1 + i * 32;
    }

    return -ENOSPC;
}

static void kvm_msi_routing_entry(struct kvm_irq_routing_entry *e,
                                  KVMMsiMessage *msg)
{
    e->gsi = msg->gsi;
    e->type = KVM_IRQ_ROUTING_MSI;
    e->flags = 0;
    e->u.msi.address_lo = msg->addr_lo;
    e->u.msi.address_hi = msg->addr_hi;
    e->u.msi.data = msg->data;
}

int kvm_msi_message_add(KVMMsiMessage *msg)
{
    struct kvm_irq_routing_entry e;
    int ret;

    ret = kvm_get_irq_route_gsi();
    if (ret < 0) {
        return ret;
    }
    msg->gsi = ret;

    kvm_msi_routing_entry(&e, msg);
    return kvm_add_routing_entry(&e);
}

int kvm_msi_message_del(KVMMsiMessage *msg)
{
    struct kvm_irq_routing_entry e;

    kvm_msi_routing_entry(&e, msg);
    return kvm_del_routing_entry(&e);
}

int kvm_msi_message_update(KVMMsiMessage *old, KVMMsiMessage *new)
{
    struct kvm_irq_routing_entry e1, e2;
    int ret;

    new->gsi = old->gsi;
    if (memcmp(old, new, sizeof(KVMMsiMessage)) == 0) {
        return 0;
    }

    kvm_msi_routing_entry(&e1, old);
    kvm_msi_routing_entry(&e2, new);

    ret = kvm_update_routing_entry(&e1, &e2);
    if (ret < 0) {
        return ret;
    }

    return 1;
}

#ifdef KVM_CAP_DEVICE_MSIX
int kvm_assign_set_msix_nr(kvm_context_t kvm,
                           struct kvm_assigned_msix_nr *msix_nr)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_NR, msix_nr);
}

int kvm_assign_set_msix_entry(kvm_context_t kvm,
                              struct kvm_assigned_msix_entry *entry)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_ENTRY, entry);
}
#endif

unsigned long kvm_get_thread_id(void)
{
    return syscall(SYS_gettid);
}

static void kvm_cond_wait(pthread_cond_t *cond)
{
    CPUState *env = cpu_single_env;

    pthread_cond_wait(cond, &qemu_mutex);
    cpu_single_env = env;
}

static void sig_ipi_handler(int n)
{
}

static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        sigprocmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}

static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code, (void *)(intptr_t)siginfo->ssi_addr))
        sigbus_reraise();
}

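/* Run func(data) in the context of the vcpu thread that owns env: directly
 * if we are already on that thread, otherwise by queueing a work item,
 * kicking the vcpu with SIG_IPI and waiting until the item is flushed.
 */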
void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
{
    struct qemu_work_item wi;

    if (env == current_env) {
        func(data);
        return;
    }

    wi.func = func;
    wi.data = data;
    if (!env->kvm_cpu_state.queued_work_first) {
        env->kvm_cpu_state.queued_work_first = &wi;
    } else {
        env->kvm_cpu_state.queued_work_last->next = &wi;
    }
    env->kvm_cpu_state.queued_work_last = &wi;
    wi.next = NULL;
    wi.done = false;

    pthread_kill(env->thread->thread, SIG_IPI);
    while (!wi.done) {
        kvm_cond_wait(&qemu_work_cond);
    }
}

void kvm_update_interrupt_request(CPUState *env)
{
    int signal = 0;

    if (env) {
        if (!current_env || !current_env->created) {
            signal = 1;
        }
        /*
         * Testing for created here is really redundant
         */
        if (current_env && current_env->created &&
            env != current_env && !env->kvm_cpu_state.signalled) {
            signal = 1;
        }

        if (signal) {
            env->kvm_cpu_state.signalled = 1;
            if (env->thread) {
                pthread_kill(env->thread->thread, SIG_IPI);
            }
        }
    }
}

int kvm_cpu_exec(CPUState *env)
{
    int r;

    r = kvm_run(env);
    if (r < 0) {
        printf("kvm_run returned %d\n", r);
        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(VMSTOP_PANIC);
    }

    return 0;
}

int kvm_cpu_is_stopped(CPUState *env)
{
    return !vm_running || env->stopped;
}

static void flush_queued_work(CPUState *env)
{
    struct qemu_work_item *wi;

    if (!env->kvm_cpu_state.queued_work_first) {
        return;
    }

    while ((wi = env->kvm_cpu_state.queued_work_first)) {
        env->kvm_cpu_state.queued_work_first = wi->next;
        wi->func(wi->data);
        wi->done = true;
    }
    env->kvm_cpu_state.queued_work_last = NULL;
    pthread_cond_broadcast(&qemu_work_cond);
}

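/* Wait (with qemu_mutex dropped) for SIG_IPI or SIGBUS directed at this
 * vcpu thread, then run any queued work items and honour a pending stop
 * request before returning to the caller.
 */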
static void kvm_main_loop_wait(CPUState *env, int timeout)
{
    struct timespec ts;
    int r, e;
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;

    ts.tv_sec = timeout / 1000;
    ts.tv_nsec = (timeout % 1000) * 1000000;
    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        pthread_mutex_unlock(&qemu_mutex);

        r = sigtimedwait(&waitset, &siginfo, &ts);
        e = errno;

        pthread_mutex_lock(&qemu_mutex);

        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
            printf("sigtimedwait: %s\n", strerror(e));
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr))
                sigbus_reraise();
            break;
        default:
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            printf("sigpending: %s\n", strerror(e));
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));

    cpu_single_env = env;
    flush_queued_work(env);

    if (env->stop) {
        env->stop = 0;
        env->stopped = 1;
        pthread_cond_signal(&qemu_pause_cond);
    }

    env->kvm_cpu_state.signalled = 0;
}

static int all_threads_paused(void)
{
    CPUState *penv = first_cpu;

    while (penv) {
        if (penv->stop) {
            return 0;
        }
        penv = (CPUState *) penv->next_cpu;
    }

    return 1;
}

static void pause_all_threads(void)
{
    CPUState *penv = first_cpu;

    while (penv) {
        if (penv != cpu_single_env) {
            penv->stop = 1;
            pthread_kill(penv->thread->thread, SIG_IPI);
        } else {
            penv->stop = 0;
            penv->stopped = 1;
            cpu_exit(penv);
        }
        penv = (CPUState *) penv->next_cpu;
    }

    while (!all_threads_paused()) {
        kvm_cond_wait(&qemu_pause_cond);
    }
}

static void resume_all_threads(void)
{
    CPUState *penv = first_cpu;

    assert(!cpu_single_env);

    while (penv) {
        penv->stop = 0;
        penv->stopped = 0;
        pthread_kill(penv->thread->thread, SIG_IPI);
        penv = (CPUState *) penv->next_cpu;
    }
}

static void kvm_vm_state_change_handler(void *context, int running, int reason)
{
    if (running) {
        resume_all_threads();
    } else {
        pause_all_threads();
    }
}

static void setup_kernel_sigmask(CPUState *env)
{
    sigset_t set;

    sigemptyset(&set);
    sigaddset(&set, SIGUSR2);
    sigaddset(&set, SIGIO);
    sigaddset(&set, SIGALRM);
    sigprocmask(SIG_BLOCK, &set, NULL);

    sigprocmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);

    kvm_set_signal_mask(env, &set);
}

static void qemu_kvm_system_reset(void)
{
    pause_all_threads();

    cpu_synchronize_all_states();
    qemu_system_reset();

    resume_all_threads();
}

static void process_irqchip_events(CPUState *env)
{
    kvm_arch_process_irqchip_events(env);
    if (kvm_arch_has_work(env))
        env->halted = 0;
}

static int kvm_main_loop_cpu(CPUState *env)
{
    while (1) {
        int run_cpu = !kvm_cpu_is_stopped(env);
        if (run_cpu) {
            kvm_arch_process_async_events(env);
            if (!kvm_irqchip_in_kernel()) {
                process_irqchip_events(env);
                run_cpu = !env->halted;
            }
        }
        if (run_cpu) {
            kvm_cpu_exec(env);
            kvm_main_loop_wait(env, 0);
        } else {
            kvm_main_loop_wait(env, 1000);
        }
    }
    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}

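/* Per-vcpu thread entry point: create the vcpu, announce its creation to
 * the main thread, wait for machine initialization to finish and then
 * enter the cpu execution loop above.
 */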
static void *ap_main_loop(void *_env)
{
    CPUState *env = _env;
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
    struct ioperm_data *data = NULL;
#endif

    current_env = env;
    env->thread_id = kvm_get_thread_id();

#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
    /* do ioperm for io ports of assigned devices */
    QLIST_FOREACH(data, &ioperm_head, entries)
        on_vcpu(env, kvm_arch_do_ioperm, data);
#endif

    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;

    if (kvm_create_vcpu(env) < 0) {
        abort();
    }
    setup_kernel_sigmask(env);

    /* signal VCPU creation */
    current_env->created = 1;
    pthread_cond_signal(&qemu_vcpu_cond);

    /* and wait for machine initialization */
    while (!qemu_system_ready) {
        kvm_cond_wait(&qemu_system_cond);
    }

    /* re-initialize cpu_single_env after re-acquiring qemu_mutex */
    cpu_single_env = env;

    kvm_main_loop_cpu(env);
    return NULL;
}

int kvm_init_vcpu(CPUState *env)
{
    env->thread = qemu_mallocz(sizeof(QemuThread));
    qemu_thread_create(env->thread, ap_main_loop, env);

    while (env->created == 0) {
        kvm_cond_wait(&qemu_vcpu_cond);
    }

    return 0;
}

#ifdef TARGET_I386
void kvm_hpet_disable_kpit(void)
{
    struct kvm_pit_state2 ps2;

    kvm_get_pit2(kvm_context, &ps2);
    ps2.flags |= KVM_PIT_FLAGS_HPET_LEGACY;
    kvm_set_pit2(kvm_context, &ps2);
}

void kvm_hpet_enable_kpit(void)
{
    struct kvm_pit_state2 ps2;

    kvm_get_pit2(kvm_context, &ps2);
    ps2.flags &= ~KVM_PIT_FLAGS_HPET_LEGACY;
    kvm_set_pit2(kvm_context, &ps2);
}
#endif

int kvm_init_ap(void)
{
    struct sigaction action;

    qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);

    signal(SIG_IPI, sig_ipi_handler);

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);
    prctl(PR_MCE_KILL, 1, 1, 0, 0);
    return 0;
}

/* If we have signalfd, we mask out the signals we want to handle and then
 * use signalfd to listen for them.  We rely on whatever the current signal
 * handler is to dispatch the signals when we receive them.
 */
static void sigfd_handler(void *opaque)
{
    int fd = (unsigned long) opaque;
    struct qemu_signalfd_siginfo info;
    struct sigaction action;
    ssize_t len;

    while (1) {
        do {
            len = read(fd, &info, sizeof(info));
        } while (len == -1 && errno == EINTR);

        if (len == -1 && errno == EAGAIN) {
            break;
        }

        if (len != sizeof(info)) {
            printf("read from sigfd returned %zd: %m\n", len);
            return;
        }

        sigaction(info.ssi_signo, NULL, &action);
        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
            action.sa_sigaction(info.ssi_signo,
                                (siginfo_t *)&info, NULL);
        } else if (action.sa_handler) {
            action.sa_handler(info.ssi_signo);
        }
    }
}

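/* I/O thread main loop: block the signals handled through signalfd, then
 * service the main loop and react to shutdown, powerdown, reset and debug
 * requests until a shutdown terminates the loop.
 */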
int kvm_main_loop(void)
{
    sigset_t mask;
    int sigfd;

    qemu_system_ready = 1;

    sigemptyset(&mask);
    sigaddset(&mask, SIGIO);
    sigaddset(&mask, SIGALRM);
    sigaddset(&mask, SIGBUS);
    sigprocmask(SIG_BLOCK, &mask, NULL);

    sigfd = qemu_signalfd(&mask);
    if (sigfd == -1) {
        fprintf(stderr, "failed to create signalfd\n");
        return -errno;
    }

    fcntl(sigfd, F_SETFL, O_NONBLOCK);

    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
                         (void *)(unsigned long) sigfd);

    pthread_cond_broadcast(&qemu_system_cond);

    cpu_single_env = NULL;

    while (1) {
        main_loop_wait(0);
        if (qemu_shutdown_requested()) {
            monitor_protocol_event(QEVENT_SHUTDOWN, NULL);
            if (qemu_no_shutdown()) {
                vm_stop(VMSTOP_SHUTDOWN);
            } else {
                break;
            }
        } else if (qemu_powerdown_requested()) {
            monitor_protocol_event(QEVENT_POWERDOWN, NULL);
            qemu_irq_raise(qemu_system_powerdown);
        } else if (qemu_reset_requested()) {
            qemu_kvm_system_reset();
        } else if (kvm_debug_cpu_requested) {
            gdb_set_stop_cpu(kvm_debug_cpu_requested);
            vm_stop(VMSTOP_DEBUG);
            kvm_debug_cpu_requested = NULL;
        }
    }

    bdrv_close_all();
    pause_all_threads();
    pthread_mutex_unlock(&qemu_mutex);

    return 0;
}

#if !defined(TARGET_I386)
int kvm_arch_init_irq_routing(void)
{
    return 0;
}
#endif

extern int no_hpet;

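/* Create the VM file descriptor, probe the kernel capabilities this code
 * relies on, create the in-kernel irqchip if requested and bring up the
 * vcpu thread infrastructure.
 */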
static int kvm_create_context(void)
{
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";

    int r;

    kvm_state->pit_in_kernel = kvm_pit;

#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context->irq_routes = qemu_mallocz(sizeof(*kvm_context->irq_routes));
    kvm_context->nr_allocated_irq_routes = 0;
#endif

    kvm_state->vmfd = kvm_ioctl(kvm_state, KVM_CREATE_VM, 0);
    if (kvm_state->vmfd < 0) {
        fprintf(stderr, "kvm_create_vm: %m\n");
        kvm_finalize(kvm_state);
        return -1;
    }

    r = kvm_arch_init(kvm_state);
    if (r < 0) {
        kvm_finalize(kvm_state);
        return r;
    }

    kvm_create_irqchip(kvm_context);

    /* There was a nasty bug in < kvm-80 that prevents memory slots from being
     * destroyed properly.  Since we rely on this capability, refuse to work
     * with any kernel without this capability. */
    if (!kvm_check_extension(kvm_state, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
        fprintf(stderr,
                "KVM kernel module broken (DESTROY_MEMORY_REGION).\n%s",
                upgrade_note);
        return -EINVAL;
    }

    r = kvm_arch_init_irq_routing();
    if (r < 0) {
        return r;
    }

    kvm_state->coalesced_mmio = 0;
#ifdef KVM_CAP_COALESCED_MMIO
    kvm_state->coalesced_mmio =
        kvm_check_extension(kvm_state, KVM_CAP_COALESCED_MMIO);
#endif

    kvm_state->vcpu_events = 0;
#ifdef KVM_CAP_VCPU_EVENTS
    kvm_state->vcpu_events = kvm_check_extension(kvm_state, KVM_CAP_VCPU_EVENTS);
#endif

    kvm_state->debugregs = 0;
#ifdef KVM_CAP_DEBUGREGS
    kvm_state->debugregs = kvm_check_extension(kvm_state, KVM_CAP_DEBUGREGS);
#endif

    kvm_state->xsave = 0;
#ifdef KVM_CAP_XSAVE
    kvm_state->xsave = kvm_check_extension(kvm_state, KVM_CAP_XSAVE);
#endif

    kvm_state->xcrs = 0;
#ifdef KVM_CAP_XCRS
    kvm_state->xcrs = kvm_check_extension(kvm_state, KVM_CAP_XCRS);
#endif

    kvm_state->many_ioeventfds = kvm_check_many_ioeventfds();

    kvm_init_ap();
    if (kvm_irqchip) {
        if (!qemu_kvm_has_gsi_routing()) {
            irq0override = 0;
#ifdef TARGET_I386
            /* if kernel can't do irq routing, interrupt source
             * override 0->2 can not be set up as required by hpet,
             * so disable hpet.
             */
            no_hpet = 1;
        } else if (!qemu_kvm_has_pit_state2()) {
            no_hpet = 1;
        }
#else
        }
#endif
    }

    return 0;
}

#ifdef KVM_CAP_IRQCHIP

int kvm_set_irq(int irq, int level, int *status)
{
    return kvm_set_irq_level(kvm_context, irq, level, status);
}

#endif

static void kvm_mutex_unlock(void)
{
    assert(!cpu_single_env);
    pthread_mutex_unlock(&qemu_mutex);
}

static void kvm_mutex_lock(void)
{
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = NULL;
}

void qemu_mutex_unlock_iothread(void)
{
    if (kvm_enabled()) {
        kvm_mutex_unlock();
    }
}

void qemu_mutex_lock_iothread(void)
{
    if (kvm_enabled()) {
        kvm_mutex_lock();
    }
}

#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
void kvm_add_ioperm_data(struct ioperm_data *data)
{
    QLIST_INSERT_HEAD(&ioperm_head, data, entries);
}

void kvm_remove_ioperm_data(unsigned long start_port, unsigned long num)
{
    struct ioperm_data *data;

    data = QLIST_FIRST(&ioperm_head);
    while (data) {
        struct ioperm_data *next = QLIST_NEXT(data, entries);

        if (data->start_port == start_port && data->num == num) {
            QLIST_REMOVE(data, entries);
            qemu_free(data);
        }

        data = next;
    }
}

void kvm_ioperm(CPUState *env, void *data)
{
    if (kvm_enabled() && qemu_system_ready) {
        on_vcpu(env, kvm_arch_do_ioperm, data);
    }
}

#endif

int kvm_set_boot_cpu_id(uint32_t id)
{
    return kvm_set_boot_vcpu_id(kvm_context, id);
}