4 * Copyright (C) 2006-2008 Qumranet Technologies
6 * Licensed under the terms of the GNU GPL version 2 or higher.
9 #include "config-host.h"
15 #include "qemu-common.h"
25 #include <sys/utsname.h>
26 #include <sys/syscall.h>
28 #include <sys/ioctl.h>
34 #define EXPECTED_KVM_API_VERSION 12
36 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
37 #error libkvm: userspace and kernel version mismatch
/* PIT reinjection setting; initialised to 1 (enabled). */
43 int kvm_pit_reinject
= 1;
/* Global KVM context pointer; kvm_init() points it at kvm_state->kvm_context. */
48 kvm_context_t kvm_context
;
/*
 * Statically initialised global mutex.  In this file it is locked in
 * kvm_init() and post_kvm_run(), and released in pre_kvm_run() and
 * kvm_main_loop_wait().
 */
50 pthread_mutex_t qemu_mutex
= PTHREAD_MUTEX_INITIALIZER
;
/* Statically initialised condition variables. */
51 pthread_cond_t qemu_vcpu_cond
= PTHREAD_COND_INITIALIZER
;
52 pthread_cond_t qemu_system_cond
= PTHREAD_COND_INITIALIZER
;
/* Signalled from kvm_main_loop_wait() once a requested stop has been applied. */
53 pthread_cond_t qemu_pause_cond
= PTHREAD_COND_INITIALIZER
;
/* Waited on by on_vcpu(); broadcast by flush_queued_work(). */
54 pthread_cond_t qemu_work_cond
= PTHREAD_COND_INITIALIZER
;
/* Per-thread (__thread) CPUState pointer. */
55 __thread CPUState
*current_env
;
/* Gate checked by kvm_load_registers() / kvm_load_mpstate() before calling on_vcpu(). */
57 static int qemu_system_ready
;
/*
 * Real-time signal delivered with pthread_kill() to vcpu threads and
 * collected with sigtimedwait() in kvm_main_loop_wait().
 */
59 #define SIG_IPI (SIGRTMIN+4)
/* Descriptors start out as -1 (no descriptor). */
62 static int io_thread_fd
= -1;
63 static int io_thread_sigfd
= -1;
/* Set by kvm_debug() to the CPU that hit the debug exit. */
65 static CPUState
*kvm_debug_cpu_requested
;
67 static uint64_t phys_ram_size
;
69 /* The list of ioperm_data */
70 static LIST_HEAD(, ioperm_data
) ioperm_head
;
72 //#define DEBUG_MEMREG
74 #define DPRINTF(fmt, args...) \
75 do { fprintf(stderr, "%s:%d " fmt , __func__, __LINE__, ##args); } while (0)
77 #define DPRINTF(fmt, args...) do {} while (0)
80 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
82 int kvm_abi
= EXPECTED_KVM_API_VERSION
;
85 #ifdef KVM_CAP_SET_GUEST_DEBUG
86 static int kvm_debug(void *opaque
, void *data
,
87 struct kvm_debug_exit_arch
*arch_info
)
89 int handle
= kvm_arch_debug(arch_info
);
93 kvm_debug_cpu_requested
= env
;
94 env
->kvm_cpu_state
.stopped
= 1;
/*
 * 8-bit port read: stores the value returned by cpu_inb(0, addr) through
 * `data`.  `opaque` is not referenced in the visible lines.  The return
 * statement is not part of this listing.
 */
100 static int kvm_inb(void *opaque
, uint16_t addr
, uint8_t *data
)
102 *data
= cpu_inb(0, addr
);
/*
 * 16-bit port read: stores the value returned by cpu_inw(0, addr) through
 * `data`.  `opaque` is not referenced in the visible lines.  The return
 * statement is not part of this listing.
 */
106 static int kvm_inw(void *opaque
, uint16_t addr
, uint16_t *data
)
108 *data
= cpu_inw(0, addr
);
/*
 * 32-bit port read: stores the value returned by cpu_inl(0, addr) through
 * `data`.  `opaque` is not referenced in the visible lines.  The return
 * statement is not part of this listing.
 */
112 static int kvm_inl(void *opaque
, uint16_t addr
, uint32_t *data
)
114 *data
= cpu_inl(0, addr
);
118 #define PM_IO_BASE 0xb000
120 static int kvm_outb(void *opaque
, uint16_t addr
, uint8_t data
)
125 cpu_outb(0, 0xb3, 0);
132 x
= cpu_inw(0, PM_IO_BASE
+ 4);
134 cpu_outw(0, PM_IO_BASE
+ 4, x
);
141 x
= cpu_inw(0, PM_IO_BASE
+ 4);
143 cpu_outw(0, PM_IO_BASE
+ 4, x
);
151 cpu_outb(0, addr
, data
);
/*
 * 16-bit port write: forwards `data` to port `addr` via cpu_outw(0, addr, data).
 * `opaque` is not referenced in the visible lines.  The return statement is
 * not part of this listing.
 */
155 static int kvm_outw(void *opaque
, uint16_t addr
, uint16_t data
)
157 cpu_outw(0, addr
, data
);
/*
 * 32-bit port write: forwards `data` to port `addr` via cpu_outl(0, addr, data).
 * `opaque` is not referenced in the visible lines.  The return statement is
 * not part of this listing.
 */
161 static int kvm_outl(void *opaque
, uint16_t addr
, uint32_t data
)
163 cpu_outl(0, addr
, data
);
/*
 * MMIO read: calls cpu_physical_memory_rw(addr, data, len, 0), i.e. with the
 * final (is-write) argument 0, so `len` bytes at guest-physical `addr` are
 * read into `data`.  `opaque` is not referenced in the visible lines.  The
 * return statement is not part of this listing.
 */
167 int kvm_mmio_read(void *opaque
, uint64_t addr
, uint8_t *data
, int len
)
169 cpu_physical_memory_rw(addr
, data
, len
, 0);
/*
 * MMIO write: calls cpu_physical_memory_rw(addr, data, len, 1), i.e. with the
 * final (is-write) argument 1, so `len` bytes from `data` are written to
 * guest-physical `addr`.  Also used by kvm_run() to replay the coalesced-MMIO
 * ring.  `opaque` is not referenced in the visible lines.  The return
 * statement is not part of this listing.
 */
173 int kvm_mmio_write(void *opaque
, uint64_t addr
, uint8_t *data
, int len
)
175 cpu_physical_memory_rw(addr
, data
, len
, 1);
/*
 * Reports an exit that has no handler: prints `reason` in hexadecimal to
 * stderr.  kvm_run() calls this for KVM_EXIT_UNKNOWN and
 * KVM_EXIT_FAIL_ENTRY.  The return statement is not part of this listing.
 */
179 static int handle_unhandled(uint64_t reason
)
181 fprintf(stderr
, "kvm: unhandled exit %"PRIx64
"\n", reason
);
/*
 * Mark `gsi` as used in kvm->used_gsi_bitmap.
 *
 * The bitmap is an array of 32-bit words: word index is gsi / 32, bit index
 * is gsi % 32.  The bit is only set when gsi < kvm->max_gsi; the diagnostic
 * at the end reports a GSI outside that range (presumably the `else` branch;
 * that line is not part of this listing).
 */
186 static inline void set_gsi(kvm_context_t kvm
, unsigned int gsi
)
188 uint32_t *bitmap
= kvm
->used_gsi_bitmap
;
190 if (gsi
< kvm
->max_gsi
)
191 bitmap
[gsi
/ 32] |= 1U << (gsi
% 32);
/* The format has a conversion, so the offending GSI must be passed (unsigned: %u). */
193 DPRINTF("Invalid GSI %u\n", gsi);
/*
 * Mark `gsi` as free in kvm->used_gsi_bitmap.
 *
 * The bitmap is an array of 32-bit words: word index is gsi / 32, bit index
 * is gsi % 32.  The bit is only cleared when gsi < kvm->max_gsi; the
 * diagnostic at the end reports a GSI outside that range (presumably the
 * `else` branch; that line is not part of this listing).
 */
196 static inline void clear_gsi(kvm_context_t kvm
, unsigned int gsi
)
198 uint32_t *bitmap
= kvm
->used_gsi_bitmap
;
200 if (gsi
< kvm
->max_gsi
)
201 bitmap
[gsi
/ 32] &= ~(1U << (gsi
% 32));
/* The format has a conversion, so the offending GSI must be passed (unsigned: %u). */
203 DPRINTF("Invalid GSI %u\n", gsi);
207 unsigned long phys_addr
;
209 unsigned long userspace_addr
;
214 struct slot_info slots
[KVM_MAX_NUM_MEM_REGIONS
];
216 static void init_slots(void)
220 for (i
= 0; i
< KVM_MAX_NUM_MEM_REGIONS
; ++i
)
224 static int get_free_slot(kvm_context_t kvm
)
229 #if defined(KVM_CAP_SET_TSS_ADDR) && !defined(__s390__)
230 tss_ext
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_SET_TSS_ADDR
);
236 * on older kernels where the set tss ioctl is not supported we must save
237 * slot 0 to hold the extended memory, as the vmx will use the last 3
238 * pages of this slot.
245 for (; i
< KVM_MAX_NUM_MEM_REGIONS
; ++i
)
/*
 * Record the parameters of a memory slot in the file-level slots[] table.
 *
 * Copies phys_addr, len, userspace_addr and flags into slots[slot].  `slot`
 * is used directly as the array index; no range check is visible here.
 */
251 static void register_slot(int slot
, unsigned long phys_addr
, unsigned long len
,
252 unsigned long userspace_addr
, unsigned flags
)
254 slots
[slot
].phys_addr
= phys_addr
;
255 slots
[slot
].len
= len
;
256 slots
[slot
].userspace_addr
= userspace_addr
;
257 slots
[slot
].flags
= flags
;
260 static void free_slot(int slot
)
263 slots
[slot
].logging_count
= 0;
/*
 * Look up the slot that contains guest-physical address `phys_addr`.
 *
 * Scans all KVM_MAX_NUM_MEM_REGIONS entries of slots[]; an entry matches when
 * it is non-empty (len != 0) and phys_addr lies in the inclusive range
 * [slots[i].phys_addr, slots[i].phys_addr + slots[i].len - 1].
 *
 * The return statements are not part of this listing; callers in this file
 * (e.g. kvm_destroy_phys_mem) treat -1 as "no slot found".
 */
266 static int get_slot(unsigned long phys_addr
)
270 for (i
= 0; i
< KVM_MAX_NUM_MEM_REGIONS
; ++i
) {
271 if (slots
[i
].len
&& slots
[i
].phys_addr
<= phys_addr
&&
272 (slots
[i
].phys_addr
+ slots
[i
].len
-1) >= phys_addr
)
/*
 * Find a slot that fully contains the region [phys_addr, phys_addr + size).
 *
 * An entry of slots[] matches when it is non-empty, starts at or below
 * phys_addr, and its end (phys_addr + len) is at or beyond phys_addr + size.
 * The return statements themselves are not part of this listing; the original
 * comment below states the contract.
 */
278 /* Returns -1 if this slot is not totally contained on any other,
279 * and the number of the slot otherwise */
280 static int get_container_slot(uint64_t phys_addr
, unsigned long size
)
284 for (i
= 0; i
< KVM_MAX_NUM_MEM_REGIONS
; ++i
)
285 if (slots
[i
].len
&& slots
[i
].phys_addr
<= phys_addr
&&
286 (slots
[i
].phys_addr
+ slots
[i
].len
) >= phys_addr
+ size
)
291 int kvm_is_containing_region(kvm_context_t kvm
, unsigned long phys_addr
, unsigned long size
)
293 int slot
= get_container_slot(phys_addr
, size
);
300 * dirty pages logging control
302 static int kvm_dirty_pages_log_change(kvm_context_t kvm
,
303 unsigned long phys_addr
,
308 int slot
= get_slot(phys_addr
);
311 fprintf(stderr
, "BUG: %s: invalid parameters\n", __FUNCTION__
);
315 flags
= (slots
[slot
].flags
& ~mask
) | flags
;
316 if (flags
== slots
[slot
].flags
)
318 slots
[slot
].flags
= flags
;
321 struct kvm_userspace_memory_region mem
= {
323 .memory_size
= slots
[slot
].len
,
324 .guest_phys_addr
= slots
[slot
].phys_addr
,
325 .userspace_addr
= slots
[slot
].userspace_addr
,
326 .flags
= slots
[slot
].flags
,
330 DPRINTF("slot %d start %llx len %llx flags %x\n",
335 r
= kvm_vm_ioctl(kvm_state
, KVM_SET_USER_MEMORY_REGION
, &mem
);
337 fprintf(stderr
, "%s: %m\n", __FUNCTION__
);
342 static int kvm_dirty_pages_log_change_all(kvm_context_t kvm
,
343 int (*change
)(kvm_context_t kvm
,
349 for (i
=r
=0; i
<KVM_MAX_NUM_MEM_REGIONS
&& r
==0; i
++) {
351 r
= change(kvm
, slots
[i
].phys_addr
, slots
[i
].len
);
356 int kvm_dirty_pages_log_enable_slot(kvm_context_t kvm
,
360 int slot
= get_slot(phys_addr
);
362 DPRINTF("start %"PRIx64
" len %"PRIx64
"\n", phys_addr
, len
);
364 fprintf(stderr
, "BUG: %s: invalid parameters\n", __func__
);
368 if (slots
[slot
].logging_count
++)
371 return kvm_dirty_pages_log_change(kvm
, slots
[slot
].phys_addr
,
372 KVM_MEM_LOG_DIRTY_PAGES
,
373 KVM_MEM_LOG_DIRTY_PAGES
);
376 int kvm_dirty_pages_log_disable_slot(kvm_context_t kvm
,
380 int slot
= get_slot(phys_addr
);
383 fprintf(stderr
, "BUG: %s: invalid parameters\n", __func__
);
387 if (--slots
[slot
].logging_count
)
390 return kvm_dirty_pages_log_change(kvm
, slots
[slot
].phys_addr
,
392 KVM_MEM_LOG_DIRTY_PAGES
);
396 * Enable dirty page logging for all memory regions
398 int kvm_dirty_pages_log_enable_all(kvm_context_t kvm
)
400 if (kvm
->dirty_pages_log_all
)
402 kvm
->dirty_pages_log_all
= 1;
403 return kvm_dirty_pages_log_change_all(kvm
,
404 kvm_dirty_pages_log_enable_slot
);
408 * Enable dirty page logging only for memory regions that were created with
409 * dirty logging enabled (disable for all other memory regions).
411 int kvm_dirty_pages_log_reset(kvm_context_t kvm
)
413 if (!kvm
->dirty_pages_log_all
)
415 kvm
->dirty_pages_log_all
= 0;
416 return kvm_dirty_pages_log_change_all(kvm
,
417 kvm_dirty_pages_log_disable_slot
);
421 int kvm_init(int smp_cpus
)
427 fd
= open("/dev/kvm", O_RDWR
);
429 perror("open /dev/kvm");
432 r
= ioctl(fd
, KVM_GET_API_VERSION
, 0);
434 fprintf(stderr
, "kvm kernel version too old: "
435 "KVM_GET_API_VERSION ioctl not supported\n");
438 if (r
< EXPECTED_KVM_API_VERSION
) {
439 fprintf(stderr
, "kvm kernel version too old: "
440 "We expect API version %d or newer, but got "
442 EXPECTED_KVM_API_VERSION
, r
);
445 if (r
> EXPECTED_KVM_API_VERSION
) {
446 fprintf(stderr
, "kvm userspace version too old\n");
450 kvm_page_size
= getpagesize();
451 kvm_state
= qemu_mallocz(sizeof(*kvm_state
));
452 kvm_context
= &kvm_state
->kvm_context
;
455 kvm_state
->vmfd
= -1;
456 kvm_context
->opaque
= cpu_single_env
;
457 kvm_context
->dirty_pages_log_all
= 0;
458 kvm_context
->no_irqchip_creation
= 0;
459 kvm_context
->no_pit_creation
= 0;
461 #ifdef KVM_CAP_SET_GUEST_DEBUG
462 TAILQ_INIT(&kvm_state
->kvm_sw_breakpoints
);
465 gsi_count
= kvm_get_gsi_count(kvm_context
);
469 /* Round up so we can search ints using ffs */
470 gsi_bits
= ALIGN(gsi_count
, 32);
471 kvm_context
->used_gsi_bitmap
= qemu_mallocz(gsi_bits
/ 8);
472 kvm_context
->max_gsi
= gsi_bits
;
474 /* Mark any over-allocated bits as already in use */
475 for (i
= gsi_count
; i
< gsi_bits
; i
++)
476 set_gsi(kvm_context
, i
);
479 pthread_mutex_lock(&qemu_mutex
);
487 static void kvm_finalize(KVMState
*s
)
490 if (kvm->vcpu_fd[0] != -1)
491 close(kvm->vcpu_fd[0]);
492 if (kvm->vm_fd != -1)
/*
 * Sets kvm->no_irqchip_creation to 1.  kvm_create_irqchip() only probes for
 * and creates the in-kernel irqchip when this flag is zero.
 */
499 void kvm_disable_irqchip_creation(kvm_context_t kvm
)
501 kvm
->no_irqchip_creation
= 1;
/*
 * Sets kvm->no_pit_creation to 1.  The consumer of this flag is not visible
 * in this file section.
 */
504 void kvm_disable_pit_creation(kvm_context_t kvm
)
506 kvm
->no_pit_creation
= 1;
509 kvm_vcpu_context_t
kvm_create_vcpu(CPUState
*env
, int id
)
513 kvm_vcpu_context_t vcpu_ctx
= qemu_malloc(sizeof(struct kvm_vcpu_context
));
514 kvm_context_t kvm
= kvm_context
;
519 r
= kvm_vm_ioctl(kvm_state
, KVM_CREATE_VCPU
, id
);
521 fprintf(stderr
, "kvm_create_vcpu: %m\n");
527 env
->kvm_state
= kvm_state
;
529 mmap_size
= kvm_ioctl(kvm_state
, KVM_GET_VCPU_MMAP_SIZE
, 0);
531 fprintf(stderr
, "get vcpu mmap size: %m\n");
534 vcpu_ctx
->run
= mmap(NULL
, mmap_size
, PROT_READ
|PROT_WRITE
, MAP_SHARED
,
536 if (vcpu_ctx
->run
== MAP_FAILED
) {
537 fprintf(stderr
, "mmap vcpu area: %m\n");
548 static int kvm_set_boot_vcpu_id(kvm_context_t kvm
, uint32_t id
)
550 #ifdef KVM_CAP_SET_BOOT_CPU_ID
551 int r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_SET_BOOT_CPU_ID
);
553 return kvm_vm_ioctl(kvm_state
, KVM_SET_BOOT_CPU_ID
, id
);
560 int kvm_create_vm(kvm_context_t kvm
)
563 #ifdef KVM_CAP_IRQ_ROUTING
564 kvm
->irq_routes
= qemu_mallocz(sizeof(*kvm
->irq_routes
));
565 kvm
->nr_allocated_irq_routes
= 0;
568 fd
= kvm_ioctl(kvm_state
, KVM_CREATE_VM
, 0);
570 fprintf(stderr
, "kvm_create_vm: %m\n");
573 kvm_state
->vmfd
= fd
;
577 static int kvm_create_default_phys_mem(kvm_context_t kvm
,
578 unsigned long phys_mem_bytes
,
581 #ifdef KVM_CAP_USER_MEMORY
582 int r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_USER_MEMORY
);
585 fprintf(stderr
, "Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported\n");
587 #error Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported
592 int kvm_check_extension(kvm_context_t kvm
, int ext
)
596 ret
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, ext
);
602 void kvm_create_irqchip(kvm_context_t kvm
)
606 kvm
->irqchip_in_kernel
= 0;
607 #ifdef KVM_CAP_IRQCHIP
608 if (!kvm
->no_irqchip_creation
) {
609 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_IRQCHIP
);
610 if (r
> 0) { /* kernel irqchip supported */
611 r
= kvm_vm_ioctl(kvm_state
, KVM_CREATE_IRQCHIP
);
613 kvm
->irqchip_inject_ioctl
= KVM_IRQ_LINE
;
614 #if defined(KVM_CAP_IRQ_INJECT_STATUS) && defined(KVM_IRQ_LINE_STATUS)
615 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
,
616 KVM_CAP_IRQ_INJECT_STATUS
);
618 kvm
->irqchip_inject_ioctl
= KVM_IRQ_LINE_STATUS
;
620 kvm
->irqchip_in_kernel
= 1;
623 fprintf(stderr
, "Create kernel PIC irqchip failed\n");
629 int kvm_create(kvm_context_t kvm
, unsigned long phys_mem_bytes
, void **vm_mem
)
633 r
= kvm_create_vm(kvm
);
636 r
= kvm_arch_create(kvm
, phys_mem_bytes
, vm_mem
);
640 r
= kvm_create_default_phys_mem(kvm
, phys_mem_bytes
, vm_mem
);
643 kvm_create_irqchip(kvm
);
/*
 * Register a userspace memory range as guest-physical memory.
 *
 * Builds a struct kvm_userspace_memory_region from phys_start and
 * userspace_addr, sets KVM_MEM_LOG_DIRTY_PAGES in flags when `log` is
 * non-zero, takes a slot number from get_free_slot(), issues
 * KVM_SET_USER_MEMORY_REGION, and records the slot with register_slot().
 * On the error path the message is printed with strerror(-r).  The
 * initialisers of the remaining fields and the return statements are not
 * part of this listing.
 */
649 int kvm_register_phys_mem(kvm_context_t kvm
,
650 unsigned long phys_start
, void *userspace_addr
,
651 unsigned long len
, int log
)
654 struct kvm_userspace_memory_region memory
= {
656 .guest_phys_addr
= phys_start
,
657 .userspace_addr
= (unsigned long)(intptr_t)userspace_addr
,
658 .flags
= log
? KVM_MEM_LOG_DIRTY_PAGES
: 0,
662 memory
.slot
= get_free_slot(kvm
);
/* memory.flags is a 32-bit field, so it is printed with %x (was %lx). */
663 DPRINTF("memory: gpa: %llx, size: %llx, uaddr: %llx, slot: %x, flags: %x\n",
664 memory
.guest_phys_addr
, memory
.memory_size
,
665 memory
.userspace_addr
, memory
.slot
, memory
.flags
);
666 r
= kvm_vm_ioctl(kvm_state
, KVM_SET_USER_MEMORY_REGION
, &memory
);
668 fprintf(stderr
, "create_userspace_phys_mem: %s\n", strerror(-r
));
671 register_slot(memory
.slot
, memory
.guest_phys_addr
, memory
.memory_size
,
672 memory
.userspace_addr
, memory
.flags
);
677 /* destroy/free a whole slot.
678 * phys_start, len and slot are the params passed to kvm_create_phys_mem()
/*
 * Destroy the memory slot that contains `phys_start`.
 *
 * Looks the slot up with get_slot(); a result of -1 or one that is
 * >= KVM_MAX_NUM_MEM_REGIONS is reported as a BUG.  If phys_start is not the
 * slot's recorded start address a warning is printed and phys_start is
 * replaced by the recorded address.  The region is then submitted with
 * KVM_SET_USER_MEMORY_REGION and the slot is released with free_slot().
 * The remaining initialisers of `memory` and the early returns are not part
 * of this listing.
 */
680 void kvm_destroy_phys_mem(kvm_context_t kvm
, unsigned long phys_start
,
685 struct kvm_userspace_memory_region memory
= {
687 .guest_phys_addr
= phys_start
,
692 slot
= get_slot(phys_start
);
694 if ((slot
>= KVM_MAX_NUM_MEM_REGIONS
) || (slot
== -1)) {
695 fprintf(stderr
, "BUG: %s: invalid parameters (slot=%d)\n",
699 if (phys_start
!= slots
[slot
].phys_addr
) {
701 "WARNING: %s: phys_start is 0x%lx expecting 0x%lx\n",
702 __FUNCTION__
, phys_start
, slots
[slot
].phys_addr
);
703 phys_start
= slots
[slot
].phys_addr
;
707 DPRINTF("slot %d start %llx len %llx flags %x\n",
709 memory
.guest_phys_addr
,
712 r
= kvm_vm_ioctl(kvm_state
, KVM_SET_USER_MEMORY_REGION
, &memory
);
/* Terminate the line, as the matching "create_userspace_phys_mem" message does. */
714 fprintf(stderr
, "destroy_userspace_phys_mem: %s\n",
719 free_slot(memory
.slot
);
/*
 * Unregister the memory area [phys_addr, phys_addr + size).
 *
 * Looks up the containing slot with get_container_slot() and hands the area
 * to kvm_destroy_phys_mem().  The condition that guards the destroy call is
 * not part of this listing.
 */
722 void kvm_unregister_memory_area(kvm_context_t kvm
, uint64_t phys_addr
, unsigned long size
)
725 int slot
= get_container_slot(phys_addr
, size
);
/* phys_addr is uint64_t: use PRIx64, as the other uint64_t prints in this file do. */
728 DPRINTF("Unregistering memory region %" PRIx64 " (%lx)\n", phys_addr
, size
);
729 kvm_destroy_phys_mem(kvm
, phys_addr
, size
);
734 static int kvm_get_map(kvm_context_t kvm
, int ioctl_num
, int slot
, void *buf
)
737 struct kvm_dirty_log log
= {
741 log
.dirty_bitmap
= buf
;
743 r
= kvm_vm_ioctl(kvm_state
, ioctl_num
, &log
);
749 int kvm_get_dirty_pages(kvm_context_t kvm
, unsigned long phys_addr
, void *buf
)
753 slot
= get_slot(phys_addr
);
754 return kvm_get_map(kvm
, KVM_GET_DIRTY_LOG
, slot
, buf
);
757 int kvm_get_dirty_pages_range(kvm_context_t kvm
, unsigned long phys_addr
,
758 unsigned long len
, void *opaque
,
759 int (*cb
)(unsigned long start
, unsigned long len
,
760 void*bitmap
, void *opaque
))
764 unsigned long end_addr
= phys_addr
+ len
;
767 for (i
= 0; i
< KVM_MAX_NUM_MEM_REGIONS
; ++i
) {
768 if ((slots
[i
].len
&& (uint64_t)slots
[i
].phys_addr
>= phys_addr
)
769 && ((uint64_t)slots
[i
].phys_addr
+ slots
[i
].len
<= end_addr
)) {
770 buf
= qemu_malloc((slots
[i
].len
/ 4096 + 7) / 8 + 2);
771 r
= kvm_get_map(kvm
, KVM_GET_DIRTY_LOG
, i
, buf
);
776 r
= cb(slots
[i
].phys_addr
, slots
[i
].len
, buf
, opaque
);
785 #ifdef KVM_CAP_IRQCHIP
787 int kvm_set_irq_level(kvm_context_t kvm
, int irq
, int level
, int *status
)
789 struct kvm_irq_level event
;
792 if (!kvm
->irqchip_in_kernel
)
796 r
= kvm_vm_ioctl(kvm_state
, kvm
->irqchip_inject_ioctl
, &event
);
798 perror("kvm_set_irq_level");
801 #ifdef KVM_CAP_IRQ_INJECT_STATUS
802 *status
= (kvm
->irqchip_inject_ioctl
== KVM_IRQ_LINE
) ?
/*
 * Read in-kernel irqchip state into `chip` with the KVM_GET_IRQCHIP vm ioctl.
 *
 * The ioctl is only reached when kvm->irqchip_in_kernel is set; what is
 * returned otherwise, and the condition guarding perror(), are not part of
 * this listing.
 */
812 int kvm_get_irqchip(kvm_context_t kvm
, struct kvm_irqchip
*chip
)
816 if (!kvm
->irqchip_in_kernel
)
818 r
= kvm_vm_ioctl(kvm_state
, KVM_GET_IRQCHIP
, chip
);
/* perror() appends ": <message>\n" itself; a newline in the prefix splits the output. */
820 perror("kvm_get_irqchip");
/*
 * Write in-kernel irqchip state from `chip` with the KVM_SET_IRQCHIP vm ioctl.
 *
 * The ioctl is only reached when kvm->irqchip_in_kernel is set; what is
 * returned otherwise, and the condition guarding perror(), are not part of
 * this listing.
 */
825 int kvm_set_irqchip(kvm_context_t kvm
, struct kvm_irqchip
*chip
)
829 if (!kvm
->irqchip_in_kernel
)
831 r
= kvm_vm_ioctl(kvm_state
, KVM_SET_IRQCHIP
, chip
);
/* perror() appends ": <message>\n" itself; a newline in the prefix splits the output. */
833 perror("kvm_set_irqchip");
840 static int handle_io(kvm_vcpu_context_t vcpu
)
842 struct kvm_run
*run
= vcpu
->run
;
843 kvm_context_t kvm
= vcpu
->kvm
;
844 uint16_t addr
= run
->io
.port
;
847 void *p
= (void *)run
+ run
->io
.data_offset
;
849 for (i
= 0; i
< run
->io
.count
; ++i
) {
850 switch (run
->io
.direction
) {
852 switch (run
->io
.size
) {
854 r
= kvm_inb(kvm
->opaque
, addr
, p
);
857 r
= kvm_inw(kvm
->opaque
, addr
, p
);
860 r
= kvm_inl(kvm
->opaque
, addr
, p
);
863 fprintf(stderr
, "bad I/O size %d\n", run
->io
.size
);
867 case KVM_EXIT_IO_OUT
:
868 switch (run
->io
.size
) {
870 r
= kvm_outb(kvm
->opaque
, addr
,
874 r
= kvm_outw(kvm
->opaque
, addr
,
878 r
= kvm_outl(kvm
->opaque
, addr
,
882 fprintf(stderr
, "bad I/O size %d\n", run
->io
.size
);
887 fprintf(stderr
, "bad I/O direction %d\n", run
->io
.direction
);
897 int handle_debug(kvm_vcpu_context_t vcpu
, void *env
)
899 #ifdef KVM_CAP_SET_GUEST_DEBUG
900 struct kvm_run
*run
= vcpu
->run
;
901 kvm_context_t kvm
= vcpu
->kvm
;
903 return kvm_debug(kvm
->opaque
, env
, &run
->debug
.arch
);
/*
 * Issues the KVM_GET_REGS ioctl on the vcpu's file descriptor (vcpu->fd),
 * passing `regs`, and returns the ioctl() result unchanged.
 */
909 int kvm_get_regs(kvm_vcpu_context_t vcpu
, struct kvm_regs
*regs
)
911 return ioctl(vcpu
->fd
, KVM_GET_REGS
, regs
);
/*
 * Issues the KVM_SET_REGS ioctl on the vcpu's file descriptor (vcpu->fd),
 * passing `regs`, and returns the ioctl() result unchanged.
 */
914 int kvm_set_regs(kvm_vcpu_context_t vcpu
, struct kvm_regs
*regs
)
916 return ioctl(vcpu
->fd
, KVM_SET_REGS
, regs
);
/*
 * Issues the KVM_GET_FPU ioctl on the vcpu's file descriptor (vcpu->fd),
 * passing `fpu`, and returns the ioctl() result unchanged.
 */
919 int kvm_get_fpu(kvm_vcpu_context_t vcpu
, struct kvm_fpu
*fpu
)
921 return ioctl(vcpu
->fd
, KVM_GET_FPU
, fpu
);
/*
 * Issues the KVM_SET_FPU ioctl on the vcpu's file descriptor (vcpu->fd),
 * passing `fpu`, and returns the ioctl() result unchanged.
 */
924 int kvm_set_fpu(kvm_vcpu_context_t vcpu
, struct kvm_fpu
*fpu
)
926 return ioctl(vcpu
->fd
, KVM_SET_FPU
, fpu
);
/*
 * Issues the KVM_GET_SREGS ioctl on the vcpu's file descriptor (vcpu->fd),
 * passing `sregs`, and returns the ioctl() result unchanged.
 */
929 int kvm_get_sregs(kvm_vcpu_context_t vcpu
, struct kvm_sregs
*sregs
)
931 return ioctl(vcpu
->fd
, KVM_GET_SREGS
, sregs
);
/*
 * Issues the KVM_SET_SREGS ioctl on the vcpu's file descriptor (vcpu->fd),
 * passing `sregs`, and returns the ioctl() result unchanged.
 */
934 int kvm_set_sregs(kvm_vcpu_context_t vcpu
, struct kvm_sregs
*sregs
)
936 return ioctl(vcpu
->fd
, KVM_SET_SREGS
, sregs
);
939 #ifdef KVM_CAP_MP_STATE
940 int kvm_get_mpstate(kvm_vcpu_context_t vcpu
, struct kvm_mp_state
*mp_state
)
944 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_MP_STATE
);
946 return ioctl(vcpu
->fd
, KVM_GET_MP_STATE
, mp_state
);
950 int kvm_set_mpstate(kvm_vcpu_context_t vcpu
, struct kvm_mp_state
*mp_state
)
954 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_MP_STATE
);
956 return ioctl(vcpu
->fd
, KVM_SET_MP_STATE
, mp_state
);
961 static int handle_mmio(kvm_vcpu_context_t vcpu
)
963 unsigned long addr
= vcpu
->run
->mmio
.phys_addr
;
964 kvm_context_t kvm
= vcpu
->kvm
;
965 struct kvm_run
*kvm_run
= vcpu
->run
;
966 void *data
= kvm_run
->mmio
.data
;
968 /* hack: Red Hat 7.1 generates these weird accesses. */
969 if ((addr
> 0xa0000-4 && addr
<= 0xa0000) && kvm_run
->mmio
.len
== 3)
972 if (kvm_run
->mmio
.is_write
)
973 return kvm_mmio_write(kvm
->opaque
, addr
, data
,
976 return kvm_mmio_read(kvm
->opaque
, addr
, data
,
980 int handle_io_window(kvm_context_t kvm
)
985 int handle_halt(kvm_vcpu_context_t vcpu
)
987 return kvm_arch_halt(vcpu
->kvm
->opaque
, vcpu
);
990 int handle_shutdown(kvm_context_t kvm
, CPUState
*env
)
992 /* stop the current vcpu from going back to guest mode */
993 env
->kvm_cpu_state
.stopped
= 1;
995 qemu_system_reset_request();
999 static inline void push_nmi(kvm_context_t kvm
)
1001 #ifdef KVM_CAP_USER_NMI
1002 kvm_arch_push_nmi(kvm
->opaque
);
1003 #endif /* KVM_CAP_USER_NMI */
1006 void post_kvm_run(kvm_context_t kvm
, CPUState
*env
)
1008 pthread_mutex_lock(&qemu_mutex
);
1009 kvm_arch_post_kvm_run(kvm
->opaque
, env
);
1012 int pre_kvm_run(kvm_context_t kvm
, CPUState
*env
)
1014 kvm_arch_pre_kvm_run(kvm
->opaque
, env
);
1016 if (env
->exit_request
)
1018 pthread_mutex_unlock(&qemu_mutex
);
/* Returns the if_flag field of the vcpu's kvm_run structure (vcpu->run). */
1022 int kvm_get_interrupt_flag(kvm_vcpu_context_t vcpu
)
1024 return vcpu
->run
->if_flag
;
/*
 * Returns the ready_for_interrupt_injection field of the vcpu's kvm_run
 * structure (vcpu->run).
 */
1027 int kvm_is_ready_for_interrupt_injection(kvm_vcpu_context_t vcpu
)
1029 return vcpu
->run
->ready_for_interrupt_injection
;
1032 int kvm_run(kvm_vcpu_context_t vcpu
, void *env
)
1036 struct kvm_run
*run
= vcpu
->run
;
1037 kvm_context_t kvm
= vcpu
->kvm
;
1041 #if !defined(__s390__)
1042 if (!kvm
->irqchip_in_kernel
)
1043 run
->request_interrupt_window
= kvm_arch_try_push_interrupts(env
);
1045 r
= pre_kvm_run(kvm
, env
);
1048 r
= ioctl(fd
, KVM_RUN
, 0);
1050 if (r
== -1 && errno
!= EINTR
&& errno
!= EAGAIN
) {
1052 post_kvm_run(kvm
, env
);
1053 fprintf(stderr
, "kvm_run: %s\n", strerror(-r
));
1057 post_kvm_run(kvm
, env
);
1059 #if defined(KVM_CAP_COALESCED_MMIO)
1060 if (kvm
->coalesced_mmio
) {
1061 struct kvm_coalesced_mmio_ring
*ring
= (void *)run
+
1062 kvm
->coalesced_mmio
* PAGE_SIZE
;
1063 while (ring
->first
!= ring
->last
) {
1064 kvm_mmio_write(kvm
->opaque
,
1065 ring
->coalesced_mmio
[ring
->first
].phys_addr
,
1066 &ring
->coalesced_mmio
[ring
->first
].data
[0],
1067 ring
->coalesced_mmio
[ring
->first
].len
);
1069 ring
->first
= (ring
->first
+ 1) %
1070 KVM_COALESCED_MMIO_MAX
;
1075 #if !defined(__s390__)
1077 r
= handle_io_window(kvm
);
1082 switch (run
->exit_reason
) {
1083 case KVM_EXIT_UNKNOWN
:
1084 r
= handle_unhandled(run
->hw
.hardware_exit_reason
);
1086 case KVM_EXIT_FAIL_ENTRY
:
1087 r
= handle_unhandled(run
->fail_entry
.hardware_entry_failure_reason
);
1089 case KVM_EXIT_EXCEPTION
:
1090 fprintf(stderr
, "exception %d (%x)\n",
1092 run
->ex
.error_code
);
1093 kvm_show_regs(vcpu
);
1094 kvm_show_code(vcpu
);
1098 r
= handle_io(vcpu
);
1100 case KVM_EXIT_DEBUG
:
1101 r
= handle_debug(vcpu
, env
);
1104 r
= handle_mmio(vcpu
);
1107 r
= handle_halt(vcpu
);
1109 case KVM_EXIT_IRQ_WINDOW_OPEN
:
1111 case KVM_EXIT_SHUTDOWN
:
1112 r
= handle_shutdown(kvm
, env
);
1114 #if defined(__s390__)
1115 case KVM_EXIT_S390_SIEIC
:
1116 r
= kvm_s390_handle_intercept(kvm
, vcpu
,
1119 case KVM_EXIT_S390_RESET
:
1120 r
= kvm_s390_handle_reset(kvm
, vcpu
, run
);
1124 if (kvm_arch_run(vcpu
)) {
1125 fprintf(stderr
, "unhandled vm exit: 0x%x\n",
1127 kvm_show_regs(vcpu
);
1139 int kvm_inject_irq(kvm_vcpu_context_t vcpu
, unsigned irq
)
1141 struct kvm_interrupt intr
;
1144 return ioctl(vcpu
->fd
, KVM_INTERRUPT
, &intr
);
1147 #ifdef KVM_CAP_SET_GUEST_DEBUG
1148 int kvm_set_guest_debug(kvm_vcpu_context_t vcpu
, struct kvm_guest_debug
*dbg
)
1150 return ioctl(vcpu
->fd
, KVM_SET_GUEST_DEBUG
, dbg
);
1154 int kvm_set_signal_mask(kvm_vcpu_context_t vcpu
, const sigset_t
*sigset
)
1156 struct kvm_signal_mask
*sigmask
;
1160 r
= ioctl(vcpu
->fd
, KVM_SET_SIGNAL_MASK
, NULL
);
1165 sigmask
= qemu_malloc(sizeof(*sigmask
) + sizeof(*sigset
));
1168 memcpy(sigmask
->sigset
, sigset
, sizeof(*sigset
));
1169 r
= ioctl(vcpu
->fd
, KVM_SET_SIGNAL_MASK
, sigmask
);
/*
 * Returns kvm->irqchip_in_kernel, the flag kvm_create_irqchip() sets to 1
 * after creating the in-kernel irqchip and to 0 otherwise.
 */
1176 int kvm_irqchip_in_kernel(kvm_context_t kvm
)
1178 return kvm
->irqchip_in_kernel
;
/* Returns kvm->pit_in_kernel; where that flag is set is not visible in this file section. */
1181 int kvm_pit_in_kernel(kvm_context_t kvm
)
1183 return kvm
->pit_in_kernel
;
1186 int kvm_has_sync_mmu(void)
1189 #ifdef KVM_CAP_SYNC_MMU
1190 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_SYNC_MMU
);
1195 int kvm_inject_nmi(kvm_vcpu_context_t vcpu
)
1197 #ifdef KVM_CAP_USER_NMI
1198 return ioctl(vcpu
->fd
, KVM_NMI
);
1204 int kvm_init_coalesced_mmio(kvm_context_t kvm
)
1207 kvm
->coalesced_mmio
= 0;
1208 #ifdef KVM_CAP_COALESCED_MMIO
1209 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_COALESCED_MMIO
);
1211 kvm
->coalesced_mmio
= r
;
1218 int kvm_coalesce_mmio_region(target_phys_addr_t addr
, ram_addr_t size
)
1220 #ifdef KVM_CAP_COALESCED_MMIO
1221 kvm_context_t kvm
= kvm_context
;
1222 struct kvm_coalesced_mmio_zone zone
;
1225 if (kvm
->coalesced_mmio
) {
1230 r
= kvm_vm_ioctl(kvm_state
, KVM_REGISTER_COALESCED_MMIO
, &zone
);
1232 perror("kvm_register_coalesced_mmio_zone");
1241 int kvm_uncoalesce_mmio_region(target_phys_addr_t addr
, ram_addr_t size
)
1243 #ifdef KVM_CAP_COALESCED_MMIO
1244 kvm_context_t kvm
= kvm_context
;
1245 struct kvm_coalesced_mmio_zone zone
;
1248 if (kvm
->coalesced_mmio
) {
1253 r
= kvm_vm_ioctl(kvm_state
, KVM_UNREGISTER_COALESCED_MMIO
, &zone
);
1255 perror("kvm_unregister_coalesced_mmio_zone");
1258 DPRINTF("Unregistered coalesced mmio region for %llx (%lx)\n", addr
, size
);
1265 #ifdef KVM_CAP_DEVICE_ASSIGNMENT
1266 int kvm_assign_pci_device(kvm_context_t kvm
,
1267 struct kvm_assigned_pci_dev
*assigned_dev
)
1269 return kvm_vm_ioctl(kvm_state
, KVM_ASSIGN_PCI_DEVICE
, assigned_dev
);
1272 static int kvm_old_assign_irq(kvm_context_t kvm
,
1273 struct kvm_assigned_irq
*assigned_irq
)
1275 return kvm_vm_ioctl(kvm_state
, KVM_ASSIGN_IRQ
, assigned_irq
);
1278 #ifdef KVM_CAP_ASSIGN_DEV_IRQ
1279 int kvm_assign_irq(kvm_context_t kvm
,
1280 struct kvm_assigned_irq
*assigned_irq
)
1284 ret
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_ASSIGN_DEV_IRQ
);
1286 return kvm_vm_ioctl(kvm_state
, KVM_ASSIGN_DEV_IRQ
, assigned_irq
);
1289 return kvm_old_assign_irq(kvm
, assigned_irq
);
1292 int kvm_deassign_irq(kvm_context_t kvm
,
1293 struct kvm_assigned_irq
*assigned_irq
)
1295 return kvm_vm_ioctl(kvm_state
, KVM_DEASSIGN_DEV_IRQ
, assigned_irq
);
1298 int kvm_assign_irq(kvm_context_t kvm
,
1299 struct kvm_assigned_irq
*assigned_irq
)
1301 return kvm_old_assign_irq(kvm
, assigned_irq
);
1306 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT
1307 int kvm_deassign_pci_device(kvm_context_t kvm
,
1308 struct kvm_assigned_pci_dev
*assigned_dev
)
1310 return kvm_vm_ioctl(kvm_state
, KVM_DEASSIGN_PCI_DEVICE
, assigned_dev
);
1314 int kvm_destroy_memory_region_works(kvm_context_t kvm
)
1318 #ifdef KVM_CAP_DESTROY_MEMORY_REGION_WORKS
1319 ret
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
,
1320 KVM_CAP_DESTROY_MEMORY_REGION_WORKS
);
1327 int kvm_reinject_control(kvm_context_t kvm
, int pit_reinject
)
1329 #ifdef KVM_CAP_REINJECT_CONTROL
1331 struct kvm_reinject_control control
;
1333 control
.pit_reinject
= pit_reinject
;
1335 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_REINJECT_CONTROL
);
1337 return kvm_vm_ioctl(kvm_state
, KVM_REINJECT_CONTROL
, &control
);
1343 int kvm_has_gsi_routing(kvm_context_t kvm
)
1347 #ifdef KVM_CAP_IRQ_ROUTING
1348 r
= kvm_check_extension(kvm
, KVM_CAP_IRQ_ROUTING
);
1353 int kvm_get_gsi_count(kvm_context_t kvm
)
1355 #ifdef KVM_CAP_IRQ_ROUTING
1356 return kvm_check_extension(kvm
, KVM_CAP_IRQ_ROUTING
);
1362 int kvm_clear_gsi_routes(kvm_context_t kvm
)
1364 #ifdef KVM_CAP_IRQ_ROUTING
1365 kvm
->irq_routes
->nr
= 0;
1372 int kvm_add_routing_entry(kvm_context_t kvm
,
1373 struct kvm_irq_routing_entry
* entry
)
1375 #ifdef KVM_CAP_IRQ_ROUTING
1376 struct kvm_irq_routing
*z
;
1377 struct kvm_irq_routing_entry
*new;
1380 if (kvm
->irq_routes
->nr
== kvm
->nr_allocated_irq_routes
) {
1381 n
= kvm
->nr_allocated_irq_routes
* 2;
1384 size
= sizeof(struct kvm_irq_routing
);
1385 size
+= n
* sizeof(*new);
1386 z
= realloc(kvm
->irq_routes
, size
);
1389 kvm
->nr_allocated_irq_routes
= n
;
1390 kvm
->irq_routes
= z
;
1392 n
= kvm
->irq_routes
->nr
++;
1393 new = &kvm
->irq_routes
->entries
[n
];
1394 memset(new, 0, sizeof(*new));
1395 new->gsi
= entry
->gsi
;
1396 new->type
= entry
->type
;
1397 new->flags
= entry
->flags
;
1400 set_gsi(kvm
, entry
->gsi
);
1408 int kvm_add_irq_route(kvm_context_t kvm
, int gsi
, int irqchip
, int pin
)
1410 #ifdef KVM_CAP_IRQ_ROUTING
1411 struct kvm_irq_routing_entry e
;
1414 e
.type
= KVM_IRQ_ROUTING_IRQCHIP
;
1416 e
.u
.irqchip
.irqchip
= irqchip
;
1417 e
.u
.irqchip
.pin
= pin
;
1418 return kvm_add_routing_entry(kvm
, &e
);
1424 int kvm_del_routing_entry(kvm_context_t kvm
,
1425 struct kvm_irq_routing_entry
* entry
)
1427 #ifdef KVM_CAP_IRQ_ROUTING
1428 struct kvm_irq_routing_entry
*e
, *p
;
1429 int i
, gsi
, found
= 0;
1433 for (i
= 0; i
< kvm
->irq_routes
->nr
; ++i
) {
1434 e
= &kvm
->irq_routes
->entries
[i
];
1435 if (e
->type
== entry
->type
1439 case KVM_IRQ_ROUTING_IRQCHIP
: {
1440 if (e
->u
.irqchip
.irqchip
==
1441 entry
->u
.irqchip
.irqchip
1442 && e
->u
.irqchip
.pin
==
1443 entry
->u
.irqchip
.pin
) {
1444 p
= &kvm
->irq_routes
->
1445 entries
[--kvm
->irq_routes
->nr
];
1451 case KVM_IRQ_ROUTING_MSI
: {
1452 if (e
->u
.msi
.address_lo
==
1453 entry
->u
.msi
.address_lo
1454 && e
->u
.msi
.address_hi
==
1455 entry
->u
.msi
.address_hi
1456 && e
->u
.msi
.data
== entry
->u
.msi
.data
) {
1457 p
= &kvm
->irq_routes
->
1458 entries
[--kvm
->irq_routes
->nr
];
1468 /* If there are no other users of this GSI
1469 * mark it available in the bitmap */
1470 for (i
= 0; i
< kvm
->irq_routes
->nr
; i
++) {
1471 e
= &kvm
->irq_routes
->entries
[i
];
1475 if (i
== kvm
->irq_routes
->nr
)
1476 clear_gsi(kvm
, gsi
);
/*
 * Replace an existing IRQ routing entry with new contents.
 *
 * `entry` identifies the route to change and `newentry` carries the
 * replacement; both must have the same gsi and type (checked first).  The
 * routing table kvm->irq_routes is scanned for an element with entry's type
 * and gsi whose type-specific fields also match:
 *   - KVM_IRQ_ROUTING_IRQCHIP: u.irqchip.irqchip and u.irqchip.pin
 *   - KVM_IRQ_ROUTING_MSI:     u.msi.address_lo, address_hi and data
 * The matching element's type-specific union member is then overwritten from
 * `newentry`.  The return values are not part of this listing.
 */
1488 int kvm_update_routing_entry(kvm_context_t kvm
,
1489 struct kvm_irq_routing_entry
* entry
,
1490 struct kvm_irq_routing_entry
* newentry
)
1492 #ifdef KVM_CAP_IRQ_ROUTING
1493 struct kvm_irq_routing_entry
*e
;
1496 if (entry
->gsi
!= newentry
->gsi
||
1497 entry
->type
!= newentry
->type
) {
1501 for (i
= 0; i
< kvm
->irq_routes
->nr
; ++i
) {
1502 e
= &kvm
->irq_routes
->entries
[i
];
1503 if (e
->type
!= entry
->type
|| e
->gsi
!= entry
->gsi
) {
1507 case KVM_IRQ_ROUTING_IRQCHIP
:
1508 if (e
->u
.irqchip
.irqchip
== entry
->u
.irqchip
.irqchip
&&
1509 e
->u
.irqchip
.pin
== entry
->u
.irqchip
.pin
) {
/* Copy from newentry: copying from entry (just compared equal to e) changed nothing. */
1510 memcpy(&e
->u
.irqchip
, &newentry
->u
.irqchip
, sizeof e
->u
.irqchip
);
1514 case KVM_IRQ_ROUTING_MSI
:
1515 if (e
->u
.msi
.address_lo
== entry
->u
.msi
.address_lo
&&
1516 e
->u
.msi
.address_hi
== entry
->u
.msi
.address_hi
&&
1517 e
->u
.msi
.data
== entry
->u
.msi
.data
) {
/* Copy from newentry, for the same reason as the irqchip case. */
1518 memcpy(&e
->u
.msi
, &newentry
->u
.msi
, sizeof e
->u
.msi
);
1532 int kvm_del_irq_route(kvm_context_t kvm
, int gsi
, int irqchip
, int pin
)
1534 #ifdef KVM_CAP_IRQ_ROUTING
1535 struct kvm_irq_routing_entry e
;
1538 e
.type
= KVM_IRQ_ROUTING_IRQCHIP
;
1540 e
.u
.irqchip
.irqchip
= irqchip
;
1541 e
.u
.irqchip
.pin
= pin
;
1542 return kvm_del_routing_entry(kvm
, &e
);
/*
 * Push the routing table held in kvm->irq_routes to the kernel.
 *
 * When KVM_CAP_IRQ_ROUTING is defined: clears irq_routes->flags and returns
 * the result of the KVM_SET_GSI_ROUTING vm ioctl.  The branch used when the
 * capability is not defined is not part of this listing.
 */
1548 int kvm_commit_irq_routes(kvm_context_t kvm
)
1550 #ifdef KVM_CAP_IRQ_ROUTING
1551 kvm
->irq_routes
->flags
= 0;
1552 return kvm_vm_ioctl(kvm_state
, KVM_SET_GSI_ROUTING
, kvm
->irq_routes
);
1558 int kvm_get_irq_route_gsi(kvm_context_t kvm
)
1561 uint32_t *buf
= kvm
->used_gsi_bitmap
;
1563 /* Return the lowest unused GSI in the bitmap */
1564 for (i
= 0; i
< kvm
->max_gsi
/ 32; i
++) {
1569 return bit
- 1 + i
* 32;
1575 #ifdef KVM_CAP_DEVICE_MSIX
1576 int kvm_assign_set_msix_nr(kvm_context_t kvm
,
1577 struct kvm_assigned_msix_nr
*msix_nr
)
1579 return kvm_vm_ioctl(kvm_state
, KVM_ASSIGN_SET_MSIX_NR
, msix_nr
);
1582 int kvm_assign_set_msix_entry(kvm_context_t kvm
,
1583 struct kvm_assigned_msix_entry
*entry
)
1585 return kvm_vm_ioctl(kvm_state
, KVM_ASSIGN_SET_MSIX_ENTRY
, entry
);
1589 #if defined(KVM_CAP_IRQFD) && defined(CONFIG_eventfd)
1591 #include <sys/eventfd.h>
1593 static int _kvm_irqfd(kvm_context_t kvm
, int fd
, int gsi
, int flags
)
1595 struct kvm_irqfd data
= {
1601 return kvm_vm_ioctl(kvm_state
, KVM_IRQFD
, &data
);
1604 int kvm_irqfd(kvm_context_t kvm
, int gsi
, int flags
)
1609 if (!kvm_check_extension(kvm
, KVM_CAP_IRQFD
))
1616 r
= _kvm_irqfd(kvm
, fd
, gsi
, 0);
1625 #else /* KVM_CAP_IRQFD */
1627 int kvm_irqfd(kvm_context_t kvm
, int gsi
, int flags
)
1632 #endif /* KVM_CAP_IRQFD */
/* Returns the result of the raw SYS_gettid system call for the calling thread. */
1633 static inline unsigned long kvm_get_thread_id(void)
1635 return syscall(SYS_gettid
);
1638 static void qemu_cond_wait(pthread_cond_t
*cond
)
1640 CPUState
*env
= cpu_single_env
;
1641 static const struct timespec ts
= {
1646 pthread_cond_timedwait(cond
, &qemu_mutex
, &ts
);
1647 cpu_single_env
= env
;
1650 static void sig_ipi_handler(int n
)
1654 static void on_vcpu(CPUState
*env
, void (*func
)(void *data
), void *data
)
1656 struct qemu_work_item wi
;
1658 if (env
== current_env
) {
1665 if (!env
->kvm_cpu_state
.queued_work_first
)
1666 env
->kvm_cpu_state
.queued_work_first
= &wi
;
1668 env
->kvm_cpu_state
.queued_work_last
->next
= &wi
;
1669 env
->kvm_cpu_state
.queued_work_last
= &wi
;
1673 pthread_kill(env
->kvm_cpu_state
.thread
, SIG_IPI
);
1675 qemu_cond_wait(&qemu_work_cond
);
/*
 * on_vcpu() work callback: `data` carries an interrupt mask packed into the
 * pointer value (see kvm_inject_interrupt); it is cast back to long and
 * passed to cpu_interrupt() for the calling thread's current_env.
 */
1678 static void inject_interrupt(void *data
)
1680 cpu_interrupt(current_env
, (long)data
);
/*
 * Runs inject_interrupt for `env` through on_vcpu(), passing `mask` encoded
 * as a pointer-sized integer in the callback's data argument.
 */
1683 void kvm_inject_interrupt(CPUState
*env
, int mask
)
1685 on_vcpu(env
, inject_interrupt
, (void *)(long)mask
);
1688 void kvm_update_interrupt_request(CPUState
*env
)
1693 if (!current_env
|| !current_env
->kvm_cpu_state
.created
)
1696 * Testing for created here is really redundant
1698 if (current_env
&& current_env
->kvm_cpu_state
.created
&&
1699 env
!= current_env
&& !env
->kvm_cpu_state
.signalled
)
1703 env
->kvm_cpu_state
.signalled
= 1;
1704 if (env
->kvm_cpu_state
.thread
)
1705 pthread_kill(env
->kvm_cpu_state
.thread
, SIG_IPI
);
1710 static void kvm_do_load_registers(void *_env
)
1712 CPUState
*env
= _env
;
1714 kvm_arch_load_regs(env
);
/*
 * Run kvm_do_load_registers for `env` through on_vcpu(), but only when
 * kvm_enabled() is true and qemu_system_ready has been set.
 */
1717 void kvm_load_registers(CPUState
*env
)
1719 if (kvm_enabled() && qemu_system_ready
)
1720 on_vcpu(env
, kvm_do_load_registers
, env
);
/*
 * Work callback for on_vcpu(): treats its argument as a CPUState pointer
 * and passes it to kvm_arch_save_regs().
 */
1723 static void kvm_do_save_registers(void *_env
)
1725 CPUState
*env
= _env
;
1727 kvm_arch_save_regs(env
);
/*
 * Run kvm_do_save_registers for `env` through on_vcpu().
 * NOTE(review): the listing skips lines between the signature and the
 * on_vcpu() call, so any guard placed there is not visible; note that no
 * qemu_system_ready test appears here, unlike kvm_load_registers().
 */
1730 void kvm_save_registers(CPUState
*env
)
1733 on_vcpu(env
, kvm_do_save_registers
, env
);
/*
 * Work callback for on_vcpu(): treats its argument as a CPUState pointer
 * and passes it to kvm_arch_load_mpstate().
 */
1736 static void kvm_do_load_mpstate(void *_env
)
1738 CPUState
*env
= _env
;
1740 kvm_arch_load_mpstate(env
);
/*
 * Run kvm_do_load_mpstate for `env` through on_vcpu(), but only when
 * kvm_enabled() is true and qemu_system_ready has been set.
 */
1743 void kvm_load_mpstate(CPUState
*env
)
1745 if (kvm_enabled() && qemu_system_ready
)
1746 on_vcpu(env
, kvm_do_load_mpstate
, env
);
/*
 * Work callback for on_vcpu(): calls kvm_arch_save_mpstate() on the
 * CPUState passed in, then sets env->halted to 1 when env->mp_state
 * equals KVM_MP_STATE_HALTED and to 0 otherwise.
 */
1749 static void kvm_do_save_mpstate(void *_env
)
1751 CPUState
*env
= _env
;
1753 kvm_arch_save_mpstate(env
);
/* Mirror the saved mp_state into the halted flag. */
1754 env
->halted
= (env
->mp_state
== KVM_MP_STATE_HALTED
);
/*
 * Run kvm_do_save_mpstate for `env` through on_vcpu().
 * NOTE(review): the listing skips lines between the signature and the
 * on_vcpu() call, so any guard placed there is not visible.
 */
1757 void kvm_save_mpstate(CPUState
*env
)
1760 on_vcpu(env
, kvm_do_save_mpstate
, env
);
/*
 * Call kvm_run() with env's vcpu context and `env`, keeping the result
 * in `r`.
 * A "kvm_run returned %d" diagnostic is printed on a path whose
 * condition is not shown (presumably when r is negative - confirm). The
 * function's return value is likewise on lines omitted from this listing.
 */
1763 int kvm_cpu_exec(CPUState
*env
)
1767 r
= kvm_run(env
->kvm_cpu_state
.vcpu_ctx
, env
);
1769 printf("kvm_run returned %d\n", r
);
/*
 * Nonzero when the VM as a whole is not running (vm_running is 0) or
 * this cpu's kvm_cpu_state.stopped flag is set.
 */
1776 static int is_cpu_stopped(CPUState
*env
)
1778 return !vm_running
|| env
->kvm_cpu_state
.stopped
;
/*
 * Drain the work list queued on `env` by on_vcpu().
 *
 * An empty list is checked first (the action taken is not shown,
 * presumably an early return). Otherwise items are unlinked from
 * queued_work_first one at a time; when the list is exhausted
 * queued_work_last is cleared and qemu_work_cond is broadcast, which is
 * the condition on_vcpu() waits on.
 *
 * NOTE(review): the per-item processing inside the loop is on lines
 * omitted from this listing.
 */
1781 static void flush_queued_work(CPUState
*env
)
1783 struct qemu_work_item
*wi
;
1785 if (!env
->kvm_cpu_state
.queued_work_first
)
/* Pop the head item; the loop ends when the list is empty. */
1788 while ((wi
= env
->kvm_cpu_state
.queued_work_first
)) {
1789 env
->kvm_cpu_state
.queued_work_first
= wi
->next
;
1793 env
->kvm_cpu_state
.queued_work_last
= NULL
;
/* Wake every thread blocked on qemu_work_cond. */
1794 pthread_cond_broadcast(&qemu_work_cond
);
/*
 * One wait step of a vcpu thread.
 *
 * Visible sequence:
 *  1. release qemu_mutex;
 *  2. wait for SIG_IPI with sigtimedwait(), for at most `timeout`
 *     (split into seconds and nanoseconds as a millisecond count);
 *  3. re-acquire qemu_mutex;
 *  4. report failures other than EAGAIN / EINTR;
 *  5. set cpu_single_env to `env` and run flush_queued_work();
 *  6. if a stop was requested, turn it into `stopped` and signal
 *     qemu_pause_cond (pause_all_threads() waits on it);
 *  7. clear `signalled` so kvm_update_interrupt_request() may signal
 *     this cpu again.
 *
 * NOTE(review): `e` is presumably errno captured right after
 * sigtimedwait(); that assignment and the local declarations are on lines
 * omitted from this listing.
 */
1797 static void kvm_main_loop_wait(CPUState
*env
, int timeout
)
1804 pthread_mutex_unlock(&qemu_mutex
);
/* timeout is in milliseconds: whole seconds, then the remainder in ns. */
1806 ts
.tv_sec
= timeout
/ 1000;
1807 ts
.tv_nsec
= (timeout
% 1000) * 1000000;
1808 sigemptyset(&waitset
);
1809 sigaddset(&waitset
, SIG_IPI
);
1811 r
= sigtimedwait(&waitset
, &siginfo
, &ts
);
1814 pthread_mutex_lock(&qemu_mutex
);
/* EAGAIN and EINTR are excluded from the diagnostic below. */
1816 if (r
== -1 && !(e
== EAGAIN
|| e
== EINTR
)) {
1817 printf("sigtimedwait: %s\n", strerror(e
));
1821 cpu_single_env
= env
;
1822 flush_queued_work(env
);
/* Acknowledge a pending stop request. */
1824 if (env
->kvm_cpu_state
.stop
) {
1825 env
->kvm_cpu_state
.stop
= 0;
1826 env
->kvm_cpu_state
.stopped
= 1;
1827 pthread_cond_signal(&qemu_pause_cond
);
1830 env
->kvm_cpu_state
.signalled
= 0;
/*
 * Walk the cpu list starting at first_cpu and test each cpu's
 * kvm_cpu_state.stop flag (a stop request that kvm_main_loop_wait() has
 * not yet acknowledged).
 * NOTE(review): the loop header and the return statements are omitted
 * from this listing. Judging from the caller pause_all_threads(), the
 * result is nonzero once no cpu has `stop` set - confirm.
 */
1833 static int all_threads_paused(void)
1835 CPUState
*penv
= first_cpu
;
1838 if (penv
->kvm_cpu_state
.stop
)
1840 penv
= (CPUState
*)penv
->next_cpu
;
/*
 * Stop every vcpu and wait until all of them have acknowledged.
 *
 * For each cpu in the list: a cpu other than cpu_single_env gets
 * stop = 1 and a SIG_IPI; the remaining statements (stop = 0,
 * stopped = 1) mark a cpu stopped directly - presumably the else branch
 * for cpu_single_env itself, the `else` line is not shown. Afterwards the
 * caller loops on qemu_pause_cond until all_threads_paused() holds.
 *
 * NOTE(review): the loop header and any assertions at the top are on
 * lines omitted from this listing.
 */
1846 static void pause_all_threads(void)
1848 CPUState
*penv
= first_cpu
;
1851 if (penv
!= cpu_single_env
) {
/* Request a stop and signal the thread with SIG_IPI. */
1852 penv
->kvm_cpu_state
.stop
= 1;
1853 pthread_kill(penv
->kvm_cpu_state
.thread
, SIG_IPI
);
1855 penv
->kvm_cpu_state
.stop
= 0;
1856 penv
->kvm_cpu_state
.stopped
= 1;
1859 penv
= (CPUState
*)penv
->next_cpu
;
/* kvm_main_loop_wait() signals qemu_pause_cond when a cpu stops. */
1862 while (!all_threads_paused())
1863 qemu_cond_wait(&qemu_pause_cond
);
/*
 * Undo pause_all_threads(): for every cpu in the list clear both `stop`
 * and `stopped`, then send SIG_IPI to its thread.
 * Asserts that cpu_single_env is NULL on entry.
 * NOTE(review): the loop header is on a line omitted from this listing.
 */
1866 static void resume_all_threads(void)
1868 CPUState
*penv
= first_cpu
;
1870 assert(!cpu_single_env
);
1873 penv
->kvm_cpu_state
.stop
= 0;
1874 penv
->kvm_cpu_state
.stopped
= 0;
1875 pthread_kill(penv
->kvm_cpu_state
.thread
, SIG_IPI
);
1876 penv
= (CPUState
*)penv
->next_cpu
;
/*
 * VM run-state callback, registered in kvm_init_ap() with
 * qemu_add_vm_change_state_handler(). Calls resume_all_threads() on one
 * branch and pause_all_threads() on the other.
 * NOTE(review): the branch condition is not shown; presumably it is
 * `running` - confirm. `context` and `reason` are not used on any
 * visible line.
 */
1880 static void kvm_vm_state_change_handler(void *context
, int running
, int reason
)
1883 resume_all_threads();
1885 pause_all_threads();
/*
 * Prepare the signal mask passed to kvm_set_signal_mask() for this vcpu.
 *
 * Visible steps: add SIGUSR2, SIGIO and SIGALRM to `set` and block them
 * with sigprocmask(); call sigprocmask() again with a NULL new-set, which
 * only reads the current mask back into `set`; remove SIG_IPI from that
 * copy; pass the result to kvm_set_signal_mask() together with env's vcpu
 * context.
 *
 * NOTE(review): the declaration and initialisation of `set` are on lines
 * omitted from this listing.
 */
1888 static void setup_kernel_sigmask(CPUState
*env
)
1893 sigaddset(&set
, SIGUSR2
);
1894 sigaddset(&set
, SIGIO
);
1895 sigaddset(&set
, SIGALRM
);
1896 sigprocmask(SIG_BLOCK
, &set
, NULL
);
/* NULL new-set: query only, the current mask is stored in `set`. */
1898 sigprocmask(SIG_BLOCK
, NULL
, &set
);
1899 sigdelset(&set
, SIG_IPI
);
1901 kvm_set_signal_mask(env
->kvm_cpu_state
.vcpu_ctx
, &set
);
/*
 * System reset as driven from kvm_main_loop(): pause all vcpu threads,
 * call qemu_system_reset(), call kvm_arch_cpu_reset() for each cpu in
 * the list starting at first_cpu, then resume the threads.
 * NOTE(review): the loop header is on a line omitted from this listing.
 */
1904 static void qemu_kvm_system_reset(void)
1906 CPUState
*penv
= first_cpu
;
1908 pause_all_threads();
1910 qemu_system_reset();
1913 kvm_arch_cpu_reset(penv
);
1914 penv
= (CPUState
*)penv
->next_cpu
;
1917 resume_all_threads();
/*
 * Call kvm_arch_process_irqchip_events() for `env`, then test
 * kvm_arch_has_work(env).
 * NOTE(review): the statement executed when there is work is not shown;
 * kvm_main_loop_cpu() reads env->halted right after calling this, so it
 * presumably clears that flag - confirm.
 */
1920 static void process_irqchip_events(CPUState
*env
)
1922 kvm_arch_process_irqchip_events(env
);
1923 if (kvm_arch_has_work(env
))
/*
 * Main loop of one vcpu thread, entered from ap_main_loop().
 *
 * Setup (visible): install the kernel signal mask, take qemu_mutex, call
 * kvm_qemu_init_env() and kvm_tpr_vcpu_start(), set cpu_single_env and
 * load the registers with kvm_arch_load_regs().
 *
 * Each iteration computes run_cpu from is_cpu_stopped(); when the
 * irqchip is not handled in the kernel, process_irqchip_events() is
 * called and run_cpu is refined with env->halted. The loop then calls
 * kvm_main_loop_wait() with a timeout of either 0 or 1000.
 *
 * NOTE(review): the loop header, the branch that picks between the two
 * kvm_main_loop_wait() calls, the call that actually runs the guest and
 * the return value are on lines omitted from this listing.
 */
1927 static int kvm_main_loop_cpu(CPUState
*env
)
1929 setup_kernel_sigmask(env
);
1931 pthread_mutex_lock(&qemu_mutex
);
1933 kvm_qemu_init_env(env
);
1935 kvm_tpr_vcpu_start(env
);
1938 cpu_single_env
= env
;
1939 kvm_arch_load_regs(env
);
1942 int run_cpu
= !is_cpu_stopped(env
);
/* irqchip not in kernel: handle its events here first. */
1943 if (run_cpu
&& !kvm_irqchip_in_kernel(kvm_context
)) {
1944 process_irqchip_events(env
);
1945 run_cpu
= !env
->halted
;
1948 kvm_main_loop_wait(env
, 0);
1951 kvm_main_loop_wait(env
, 1000);
1954 pthread_mutex_unlock(&qemu_mutex
);
/*
 * Start routine of a vcpu thread (passed to pthread_create() by
 * kvm_init_vcpu()); `_env` is the CPUState of the vcpu to run.
 *
 * Visible sequence: record the thread id, block every signal in this
 * thread, create the vcpu with kvm_create_vcpu(), optionally apply
 * ioperm settings for assigned devices, mark the vcpu created and signal
 * qemu_vcpu_cond (kvm_init_vcpu() waits on it), wait until
 * qemu_system_ready is set, then enter kvm_main_loop_cpu().
 *
 * NOTE(review): `created` is written through current_env, so
 * current_env must be set to `env` on a line omitted from this listing -
 * confirm. The return value is also not visible.
 */
1958 static void *ap_main_loop(void *_env
)
1960 CPUState
*env
= _env
;
1962 struct ioperm_data
*data
= NULL
;
1965 env
->thread_id
= kvm_get_thread_id();
/* Block all signals in this thread. */
1966 sigfillset(&signals
);
1967 sigprocmask(SIG_BLOCK
, &signals
, NULL
);
1968 env
->kvm_cpu_state
.vcpu_ctx
= kvm_create_vcpu(env
, env
->cpu_index
);
1970 #ifdef USE_KVM_DEVICE_ASSIGNMENT
1971 /* do ioperm for io ports of assigned devices */
1972 LIST_FOREACH(data
, &ioperm_head
, entries
)
1973 on_vcpu(env
, kvm_arch_do_ioperm
, data
);
1976 /* signal VCPU creation */
1977 pthread_mutex_lock(&qemu_mutex
);
1978 current_env
->kvm_cpu_state
.created
= 1;
1979 pthread_cond_signal(&qemu_vcpu_cond
);
1981 /* and wait for machine initialization */
1982 while (!qemu_system_ready
)
1983 qemu_cond_wait(&qemu_system_cond
);
1984 pthread_mutex_unlock(&qemu_mutex
);
1986 kvm_main_loop_cpu(env
);
/*
 * Create the thread for vcpu `env` (start routine ap_main_loop, default
 * attributes) and wait on qemu_vcpu_cond until that thread has set
 * env->kvm_cpu_state.created.
 * NOTE(review): the pthread_create() result is not checked on any
 * visible line.
 */
1990 void kvm_init_vcpu(CPUState
*env
)
1992 pthread_create(&env
->kvm_cpu_state
.thread
, NULL
, ap_main_loop
, env
);
/* ap_main_loop() sets `created` and signals qemu_vcpu_cond. */
1994 while (env
->kvm_cpu_state
.created
== 0)
1995 qemu_cond_wait(&qemu_vcpu_cond
);
/*
 * Return env's kvm_cpu_state.created flag, which ap_main_loop() sets to
 * 1 after creating the vcpu.
 */
1998 int kvm_vcpu_inited(CPUState
*env
)
2000 return env
->kvm_cpu_state
.created
;
/*
 * Read the PIT state with kvm_get_pit2(), set the
 * KVM_PIT_FLAGS_HPET_LEGACY bit in its flags and write the state back
 * with kvm_set_pit2().
 * NOTE(review): the results of both calls are ignored on the visible
 * lines.
 */
2004 void kvm_hpet_disable_kpit(void)
2006 struct kvm_pit_state2 ps2
;
2008 kvm_get_pit2(kvm_context
, &ps2
);
2009 ps2
.flags
|= KVM_PIT_FLAGS_HPET_LEGACY
;
2010 kvm_set_pit2(kvm_context
, &ps2
);
/*
 * Counterpart of kvm_hpet_disable_kpit(): read the PIT state with
 * kvm_get_pit2(), clear the KVM_PIT_FLAGS_HPET_LEGACY bit in its flags
 * and write the state back with kvm_set_pit2().
 * NOTE(review): the results of both calls are ignored on the visible
 * lines.
 */
2013 void kvm_hpet_enable_kpit(void)
2015 struct kvm_pit_state2 ps2
;
2017 kvm_get_pit2(kvm_context
, &ps2
);
2018 ps2
.flags
&= ~KVM_PIT_FLAGS_HPET_LEGACY
;
2019 kvm_set_pit2(kvm_context
, &ps2
);
/*
 * One-time setup for the vcpu threads: call kvm_tpr_opt_setup(),
 * register kvm_vm_state_change_handler for VM run-state changes, and
 * install sig_ipi_handler as the handler for SIG_IPI.
 * NOTE(review): the return value is on a line omitted from this listing.
 */
2023 int kvm_init_ap(void)
2026 kvm_tpr_opt_setup();
2028 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler
, NULL
);
2030 signal(SIG_IPI
, sig_ipi_handler
);
/*
 * Write an 8-byte value to io_thread_fd, whose read side is drained by
 * io_thread_wakeup().
 *
 * Nothing is written while io_thread_fd is still -1. `value` is copied
 * into `buffer` and written in a loop until 8 bytes have gone out; EINTR
 * and EAGAIN results from write() are tested separately.
 *
 * NOTE(review): the declarations of value/buffer/offset/len and the
 * statements taken on EINTR, on EAGAIN and on other errors are on lines
 * omitted from this listing.
 */
2034 void qemu_kvm_notify_work(void)
2040 if (io_thread_fd
== -1)
2043 memcpy(buffer
, &value
, sizeof(value
));
2045 while (offset
< 8) {
2048 len
= write(io_thread_fd
, buffer
+ offset
, 8 - offset
);
2049 if (len
== -1 && errno
== EINTR
)
2052 /* In case we have a pipe, there is not reason to insist writing
2055 if (len
== -1 && errno
== EAGAIN
)
2065 /* If we have signalfd, we mask out the signals we want to handle and then
2066 * use signalfd to listen for them. We rely on whatever the current signal
2067 * handler is to dispatch the signals when we receive them.
/*
 * fd-handler for the signalfd created in kvm_main_loop(); `opaque`
 * carries the descriptor number.
 *
 * Reads qemu_signalfd_siginfo records from the descriptor, retrying on
 * EINTR. EAGAIN and short reads are tested separately, the latter with a
 * diagnostic. For a full record the current disposition of
 * info.ssi_signo is looked up with sigaction() and, if a handler is
 * installed, that handler is called directly with the signal number.
 *
 * NOTE(review): the enclosing loop and the statements taken on EAGAIN
 * and on a short read are on lines omitted from this listing.
 */
2070 static void sigfd_handler(void *opaque
)
2072 int fd
= (unsigned long)opaque
;
2073 struct qemu_signalfd_siginfo info
;
2074 struct sigaction action
;
2079 len
= read(fd
, &info
, sizeof(info
));
2080 } while (len
== -1 && errno
== EINTR
);
2082 if (len
== -1 && errno
== EAGAIN
)
2085 if (len
!= sizeof(info
)) {
2086 printf("read from sigfd returned %zd: %m\n", len
);
/* NULL new action: only fetch the installed disposition. */
2090 sigaction(info
.ssi_signo
, NULL
, &action
);
2091 if (action
.sa_handler
)
2092 action
.sa_handler(info
.ssi_signo
);
2097 /* Used to break IO thread out of select */
/*
 * fd-handler for the read side of the descriptor pair created in
 * kvm_main_loop(); `opaque` carries the descriptor number. Reads from
 * the descriptor into a local buffer, with a separate test for EINTR.
 * NOTE(review): the loop structure and its exit condition are on lines
 * omitted from this listing.
 */
2098 static void io_thread_wakeup(void *opaque
)
2100 int fd
= (unsigned long)opaque
;
2103 /* Drain the pipe/(eventfd) */
2107 len
= read(fd
, buffer
, sizeof(buffer
));
2108 if (len
== -1 && errno
== EINTR
)
/*
 * Main loop of the I/O thread.
 *
 * Setup (visible):
 *  - record the calling thread in io_thread and set qemu_system_ready;
 *  - create a descriptor pair with qemu_eventfd(), make both ends
 *    non-blocking, register io_thread_wakeup on fds[0] and publish
 *    fds[1] as io_thread_fd (written by qemu_kvm_notify_work());
 *  - block SIGIO and SIGALRM, create a signalfd for `mask`, make it
 *    non-blocking and register sigfd_handler on it;
 *  - broadcast qemu_system_cond, which the vcpu threads wait on in
 *    ap_main_loop(), and clear cpu_single_env.
 *
 * Loop (visible): main_loop_wait(1000), then handle shutdown, powerdown,
 * reset and debug-stop requests in that order. After the loop all vcpu
 * threads are paused and qemu_mutex is released.
 *
 * NOTE(review): fcntl(F_SETFL, O_NONBLOCK) replaces the descriptor's
 * status flags instead of OR-ing into them. Local declarations, the
 * initialisation of `mask`, the failure actions, the loop header/exit
 * and the return value are on lines omitted from this listing.
 */
2116 int kvm_main_loop(void)
2122 io_thread
= pthread_self();
2123 qemu_system_ready
= 1;
2125 if (qemu_eventfd(fds
) == -1) {
2126 fprintf(stderr
, "failed to create eventfd\n");
2130 fcntl(fds
[0], F_SETFL
, O_NONBLOCK
);
2131 fcntl(fds
[1], F_SETFL
, O_NONBLOCK
);
2133 qemu_set_fd_handler2(fds
[0], NULL
, io_thread_wakeup
, NULL
,
2134 (void *)(unsigned long)fds
[0]);
2136 io_thread_fd
= fds
[1];
2139 sigaddset(&mask
, SIGIO
);
2140 sigaddset(&mask
, SIGALRM
);
2141 sigprocmask(SIG_BLOCK
, &mask
, NULL
);
2143 sigfd
= qemu_signalfd(&mask
);
2145 fprintf(stderr
, "failed to create signalfd\n");
2149 fcntl(sigfd
, F_SETFL
, O_NONBLOCK
);
2151 qemu_set_fd_handler2(sigfd
, NULL
, sigfd_handler
, NULL
,
2152 (void *)(unsigned long)sigfd
);
/* Release vcpu threads waiting for qemu_system_ready. */
2154 pthread_cond_broadcast(&qemu_system_cond
);
2156 io_thread_sigfd
= sigfd
;
2157 cpu_single_env
= NULL
;
2160 main_loop_wait(1000);
2161 if (qemu_shutdown_requested()) {
2162 if (qemu_no_shutdown()) {
2166 } else if (qemu_powerdown_requested())
2167 qemu_system_powerdown();
2168 else if (qemu_reset_requested())
2169 qemu_kvm_system_reset();
2170 else if (kvm_debug_cpu_requested
) {
2171 gdb_set_stop_cpu(kvm_debug_cpu_requested
);
2172 vm_stop(EXCP_DEBUG
);
2173 kvm_debug_cpu_requested
= NULL
;
2177 pause_all_threads();
2178 pthread_mutex_unlock(&qemu_mutex
);
/*
 * Result of kvm_destroy_memory_region_works(), stored by
 * kvm_qemu_create_context(). must_use_aliases_source() and
 * must_use_aliases_target() test it before looking at the address.
 */
2184 static int destroy_region_works
= 0;
2188 #if !defined(TARGET_I386)
/*
 * Definition of kvm_arch_init_irq_routing() compiled only when
 * TARGET_I386 is not defined; kvm_qemu_create_context() calls it.
 * NOTE(review): the body is not visible in this listing.
 */
2189 int kvm_arch_init_irq_routing(void)
/*
 * Create and configure the VM for kvm_context.
 *
 * Visible steps: optionally disable in-kernel irqchip and PIT creation,
 * call kvm_create(), run kvm_arch_qemu_create_context(), turn off PIT
 * reinjection via kvm_reinject_control() when kvm_pit is set and
 * kvm_pit_reinject is 0, cache kvm_destroy_memory_region_works() in
 * destroy_region_works, and call kvm_arch_init_irq_routing().
 * kvm_finalize(kvm_state) appears on the two failure paths that are
 * visible.
 *
 * NOTE(review): the conditions guarding the two disable calls, the
 * error tests on `r` and the return values are on lines omitted from
 * this listing.
 */
2195 int kvm_qemu_create_context(void)
2200 kvm_disable_irqchip_creation(kvm_context
);
2203 kvm_disable_pit_creation(kvm_context
);
2205 if (kvm_create(kvm_context
, 0, NULL
) < 0) {
2206 kvm_finalize(kvm_state
);
2209 r
= kvm_arch_qemu_create_context();
2211 kvm_finalize(kvm_state
);
2212 if (kvm_pit
&& !kvm_pit_reinject
) {
2213 if (kvm_reinject_control(kvm_context
, 0)) {
2214 fprintf(stderr
, "failure to disable in-kernel PIT reinjection\n");
2219 destroy_region_works
= kvm_destroy_memory_region_works(kvm_context
);
2222 r
= kvm_arch_init_irq_routing();
/*
 * Predicate on a guest physical address: destroy_region_works is tested
 * first, then `addr` is compared against 0xa0000 and 0xa8000.
 * NOTE(review): the return statements are not shown; presumably 0 when
 * destroy_region_works is set, 1 for the two listed addresses and 0
 * otherwise - confirm.
 */
2231 static int must_use_aliases_source(target_phys_addr_t addr
)
2233 if (destroy_region_works
)
2235 if (addr
== 0xa0000 || addr
== 0xa8000)
/*
 * Predicate on a guest physical address: destroy_region_works is tested
 * first, then `addr` is checked against the half-open range
 * [0xe0000000, 0x100000000).
 * NOTE(review): the return statements are not shown; presumably 0 when
 * destroy_region_works is set, 1 inside the range and 0 otherwise -
 * confirm.
 */
2240 static int must_use_aliases_target(target_phys_addr_t addr
)
2242 if (destroy_region_works
)
2244 if (addr
>= 0xe0000000 && addr
< 0x100000000ull
)
/*
 * One registered memory mapping; `phys` is a guest physical address.
 * NOTE(review): only `phys` is visible in this listing. Code elsewhere
 * in the file also reads members `ram` and `len` and an array named
 * `mappings`, which are declared on omitted lines.
 */
2249 static struct mapping
{
2250 target_phys_addr_t phys
;
/*
 * Number of entries of `mappings` currently in use: incremented in
 * kvm_set_phys_mem(), decremented in drop_mapping().
 */
2254 static int nr_mappings
;
/*
 * Linear search over the first nr_mappings entries of `mappings` for the
 * one whose [ram, ram + len) interval contains `ram_addr`.
 * NOTE(review): the return statements are not shown; presumably the
 * matching entry, or NULL when nothing matches - confirm.
 */
2256 static struct mapping
*find_ram_mapping(ram_addr_t ram_addr
)
2260 for (p
= mappings
; p
< mappings
+ nr_mappings
; ++p
) {
2261 if (p
->ram
<= ram_addr
&& ram_addr
< p
->ram
+ p
->len
) {
/*
 * Linear search over the first nr_mappings entries of `mappings` for the
 * one whose [phys, phys + len) interval contains `start_addr`.
 * NOTE(review): the return statements are not shown; presumably the
 * matching entry, or NULL when nothing matches - confirm.
 */
2268 static struct mapping
*find_mapping(target_phys_addr_t start_addr
)
2272 for (p
= mappings
; p
< mappings
+ nr_mappings
; ++p
) {
2273 if (p
->phys
<= start_addr
&& start_addr
< p
->phys
+ p
->len
) {
/*
 * Remove the mapping containing `start_addr`, located with
 * find_mapping(). The slot is overwritten with the last used entry and
 * nr_mappings is decremented, so entry order is not preserved.
 * NOTE(review): a guard for "no mapping found" is presumably on the
 * line omitted between the lookup and the overwrite - confirm.
 */
2280 static void drop_mapping(target_phys_addr_t start_addr
)
2282 struct mapping
*p
= find_mapping(start_addr
);
/* Swap-with-last removal. */
2285 *p
= mappings
[--nr_mappings
];
/*
 * Tell KVM about the guest physical range [start_addr, start_addr + size)
 * backed by `phys_offset`.
 *
 * Visible logic:
 *  - phys_ram_size is raised to start_addr + size when the range ends
 *    beyond it;
 *  - IO_MEM_ROM is masked out of phys_offset and the bits below the page
 *    mask are taken as area_flags;
 *  - when area_flags is not IO_MEM_RAM: an alias is destroyed for an
 *    alias-source address, alias-target addresses are tested, and
 *    existing mappings found with find_mapping() are unregistered and
 *    dropped while start_addr advances one page at a time;
 *  - otherwise kvm_is_containing_region() is consulted, area_flags is
 *    compared with TLB_MMIO, alias-source addresses get a memory alias
 *    derived from the matching RAM mapping, and the range is registered
 *    with kvm_register_phys_mem(); a new `mappings` entry then records
 *    phys and ram.
 *
 * NOTE(review): many lines are omitted from this listing (loop headers,
 * early returns, the remaining kvm_register_phys_mem() arguments, the
 * assignment of p->len), so the control flow above is only partly
 * confirmed. No visible line checks nr_mappings against the capacity of
 * `mappings` before the append.
 */
2289 void kvm_set_phys_mem(target_phys_addr_t start_addr
, ram_addr_t size
,
2290 ram_addr_t phys_offset
)
2293 unsigned long area_flags
;
2298 if (start_addr
+ size
> phys_ram_size
) {
2299 phys_ram_size
= start_addr
+ size
;
2302 phys_offset
&= ~IO_MEM_ROM
;
2303 area_flags
= phys_offset
& ~TARGET_PAGE_MASK
;
2305 if (area_flags
!= IO_MEM_RAM
) {
2307 if (must_use_aliases_source(start_addr
)) {
2308 kvm_destroy_memory_alias(kvm_context
, start_addr
);
2311 if (must_use_aliases_target(start_addr
))
2315 p
= find_mapping(start_addr
);
2317 kvm_unregister_memory_area(kvm_context
, p
->phys
, p
->len
);
2318 drop_mapping(p
->phys
);
2320 start_addr
+= TARGET_PAGE_SIZE
;
2321 if (size
> TARGET_PAGE_SIZE
) {
2322 size
-= TARGET_PAGE_SIZE
;
2330 r
= kvm_is_containing_region(kvm_context
, start_addr
, size
);
2334 if (area_flags
>= TLB_MMIO
)
2338 if (must_use_aliases_source(start_addr
)) {
2339 p
= find_ram_mapping(phys_offset
);
2341 kvm_create_memory_alias(kvm_context
, start_addr
, size
,
2342 p
->phys
+ (phys_offset
- p
->ram
));
2348 r
= kvm_register_phys_mem(kvm_context
, start_addr
,
2349 qemu_get_ram_ptr(phys_offset
),
2352 printf("kvm_cpu_register_physical_memory: failed\n");
2357 drop_mapping(start_addr
);
2358 p
= &mappings
[nr_mappings
++];
2359 p
->phys
= start_addr
;
2360 p
->ram
= phys_offset
;
/*
 * Apply madvise(MADV_DONTFORK) to [area, area + size) when KVM is
 * enabled and kvm_has_sync_mmu() reports false. The call is compiled
 * only when MADV_DONTFORK is defined.
 * NOTE(review): the declaration of `ret`, any error report and the
 * return value are on lines omitted from this listing.
 */
2367 int kvm_setup_guest_memory(void *area
, unsigned long size
)
2371 #ifdef MADV_DONTFORK
2372 if (kvm_enabled() && !kvm_has_sync_mmu())
2373 ret
= madvise(area
, size
, MADV_DONTFORK
);
/*
 * Return kvm_check_extension() for capability `ext`, evaluated against
 * the global kvm_context.
 */
2382 int kvm_qemu_check_extension(int ext
)
2384 return kvm_check_extension(kvm_context
, ext
);
/*
 * Forward to kvm_arch_qemu_init_env() for `cenv` and return its result.
 */
2387 int kvm_qemu_init_env(CPUState
*cenv
)
2389 return kvm_arch_qemu_init_env(cenv
);
2392 #ifdef KVM_CAP_SET_GUEST_DEBUG
/*
 * Argument bundle passed through on_vcpu() from
 * kvm_update_guest_debug() to kvm_invoke_set_guest_debug(): `dbg` is the
 * debug configuration to apply.
 * NOTE(review): only `dbg` is visible in this listing;
 * kvm_invoke_set_guest_debug() also writes a member `err`, declared on
 * an omitted line.
 */
2394 struct kvm_set_guest_debug_data
{
2395 struct kvm_guest_debug dbg
;
/*
 * Work callback for on_vcpu(): calls kvm_set_guest_debug() on
 * cpu_single_env's vcpu context and stores the result in the `err`
 * member of the kvm_set_guest_debug_data passed as `data`.
 * NOTE(review): the second argument of the call is on a line omitted
 * from this listing (presumably &dbg_data->dbg - confirm).
 */
2399 static void kvm_invoke_set_guest_debug(void *data
)
2401 struct kvm_set_guest_debug_data
*dbg_data
= data
;
2403 dbg_data
->err
= kvm_set_guest_debug(cpu_single_env
->kvm_cpu_state
.vcpu_ctx
,
/*
 * Build a guest-debug configuration for `env` and apply it to that vcpu.
 *
 * control starts at 0; when env->singlestep_enabled is set it becomes
 * KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP.
 * kvm_arch_update_guest_debug() then fills in architecture specifics,
 * `reinject_trap` is OR-ed into control, and the result is handed to
 * kvm_invoke_set_guest_debug() through on_vcpu().
 *
 * NOTE(review): the return statement is not shown; presumably the `err`
 * stored by the callback - confirm.
 */
2407 int kvm_update_guest_debug(CPUState
*env
, unsigned long reinject_trap
)
2409 struct kvm_set_guest_debug_data data
;
2411 data
.dbg
.control
= 0;
2412 if (env
->singlestep_enabled
)
2413 data
.dbg
.control
= KVM_GUESTDBG_ENABLE
| KVM_GUESTDBG_SINGLESTEP
;
2415 kvm_arch_update_guest_debug(env
, &data
.dbg
);
2416 data
.dbg
.control
|= reinject_trap
;
/* data is on this stack frame; on_vcpu() is handed its address. */
2418 on_vcpu(env
, kvm_invoke_set_guest_debug
, &data
);
2425 * dirty pages logging
2427 /* FIXME: use unsigned long pointer instead of unsigned char */
/*
 * Dirty-page bitmap buffer; NULL while dirty tracking is off. Allocated
 * and freed by kvm_physical_memory_set_dirty_tracking().
 */
2428 unsigned char *kvm_dirty_bitmap
= NULL
;
/*
 * Turn dirty-page logging on or off.
 *
 * Visible logic: one path allocates kvm_dirty_bitmap (BITMAP_SIZE of
 * phys_ram_size bytes) if it does not exist yet and calls
 * kvm_dirty_pages_log_enable_all(); the other path, when a bitmap
 * exists, calls kvm_dirty_pages_log_reset(), frees the bitmap and resets
 * the pointer to NULL.
 *
 * NOTE(review): the tests on `enable` that select between the two paths,
 * the handling of a failed allocation and the return value are on lines
 * omitted from this listing.
 */
2429 int kvm_physical_memory_set_dirty_tracking(int enable
)
2437 if (!kvm_dirty_bitmap
) {
2438 unsigned bitmap_size
= BITMAP_SIZE(phys_ram_size
);
2439 kvm_dirty_bitmap
= qemu_malloc(bitmap_size
);
2440 if (kvm_dirty_bitmap
== NULL
) {
2441 perror("Failed to allocate dirty pages bitmap");
2445 r
= kvm_dirty_pages_log_enable_all(kvm_context
);
2450 if (kvm_dirty_bitmap
) {
2451 r
= kvm_dirty_pages_log_reset(kvm_context
);
2452 qemu_free(kvm_dirty_bitmap
);
2453 kvm_dirty_bitmap
= NULL
;
2459 /* get kvm's dirty pages bitmap and update qemu's */
/*
 * Transfer dirty bits from a KVM bitmap into QEMU's dirty tracking.
 *
 * `bitmap` holds one bit per target page for a region of `mem_size`
 * bytes; `len` is the number of bitmap bytes needed for it, rounded up.
 * For a page with index i * 8 + j the guest physical address is
 * offset + page_number * TARGET_PAGE_SIZE; it is translated with
 * cpu_get_physical_page_desc() and marked with
 * cpu_physical_memory_set_dirty().
 *
 * NOTE(review): the per-byte and per-bit tests that decide which pages
 * are dirty, and the return value, are on lines omitted from this
 * listing. `start_addr` is not referenced on any visible line.
 */
2460 static int kvm_get_dirty_pages_log_range(unsigned long start_addr
,
2461 unsigned char *bitmap
,
2462 unsigned long offset
,
2463 unsigned long mem_size
)
2465 unsigned int i
, j
, n
=0;
2467 unsigned long page_number
, addr
, addr1
;
2468 ram_addr_t ram_addr
;
2469 unsigned int len
= ((mem_size
/TARGET_PAGE_SIZE
) + 7) / 8;
2472 * bitmap-traveling is faster than memory-traveling (for addr...)
2473 * especially when most of the memory is not dirty.
2475 for (i
=0; i
<len
; i
++) {
2480 page_number
= i
* 8 + j
;
2481 addr1
= page_number
* TARGET_PAGE_SIZE
;
2482 addr
= offset
+ addr1
;
2483 ram_addr
= cpu_get_physical_page_desc(addr
);
2484 cpu_physical_memory_set_dirty(ram_addr
);
/*
 * Callback passed to kvm_get_dirty_pages_range(): forwards to
 * kvm_get_dirty_pages_log_range(), passing `start` as both the start
 * address and the offset and `len` as the region size. `opaque` is
 * unused.
 */
2490 static int kvm_get_dirty_bitmap_cb(unsigned long start
, unsigned long len
,
2491 void *bitmap
, void *opaque
)
2493 return kvm_get_dirty_pages_log_range(start
, bitmap
, start
, len
);
2497 * get kvm's dirty pages bitmap and update qemu's
2498 * we only care about physical ram, which resides in slots 0 and 3
/*
 * Fetch dirty pages for the range starting at 0 with length -1UL
 * through kvm_get_dirty_pages_range(), with kvm_get_dirty_bitmap_cb as
 * the per-region callback.
 * NOTE(review): one argument line of the call and the return statement
 * are omitted from this listing.
 */
2500 int kvm_update_dirty_pages_log(void)
2505 r
= kvm_get_dirty_pages_range(kvm_context
, 0, -1UL,
2507 kvm_get_dirty_bitmap_cb
);
/*
 * Enable or disable dirty logging for the slot covering [start,
 * start + size): one path calls kvm_dirty_pages_log_enable_slot(), the
 * other tests must_use_aliases_target(start) and then calls
 * kvm_dirty_pages_log_disable_slot().
 * NOTE(review): the third parameter and the condition that selects the
 * path are on lines omitted from this listing; kvm_log_start() and
 * kvm_log_stop() pass 1 and 0 for it respectively.
 */
2511 void kvm_qemu_log_memory(target_phys_addr_t start
, target_phys_addr_t size
,
2515 kvm_dirty_pages_log_enable_slot(kvm_context
, start
, size
);
2518 if (must_use_aliases_target(start
))
2521 kvm_dirty_pages_log_disable_slot(kvm_context
, start
, size
);
/*
 * Fill `bitmap` with a static layout of guest RAM pages:
 *  - the first brsize + extra_bytes bytes are set to 0xFF, where brsize
 *    is BITMAP_SIZE(ram_size) and extra_bytes covers the pages between
 *    ram_size and phys_ram_size;
 *  - the bytes from BITMAP_SIZE(0xa0000) up to BITMAP_SIZE(0xc0000) are
 *    cleared again;
 *  - the remainder up to BITMAP_SIZE(phys_ram_size) is cleared.
 *
 * NOTE(review): assumes BITMAP_SIZE(x) yields a byte count for a
 * one-bit-per-page bitmap (the macro is defined outside this listing),
 * that `bitmap` holds at least bsize bytes, and that
 * phys_ram_size >= ram_size. The return value is on an omitted line.
 */
2525 int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap
)
2527 unsigned int bsize
= BITMAP_SIZE(phys_ram_size
);
2528 unsigned int brsize
= BITMAP_SIZE(ram_size
);
2529 unsigned int extra_pages
= (phys_ram_size
- ram_size
) / TARGET_PAGE_SIZE
;
2530 unsigned int extra_bytes
= (extra_pages
+7)/8;
2531 unsigned int hole_start
= BITMAP_SIZE(0xa0000);
2532 unsigned int hole_end
= BITMAP_SIZE(0xc0000);
2534 memset(bitmap
, 0xFF, brsize
+ extra_bytes
);
2535 memset(bitmap
+ hole_start
, 0, hole_end
- hole_start
);
2536 memset(bitmap
+ brsize
+ extra_bytes
, 0, bsize
- brsize
- extra_bytes
);
2541 #ifdef KVM_CAP_IRQCHIP
/*
 * Forward to kvm_set_irq_level() on the global kvm_context with `irq`,
 * `level` and `status`, and return its result. Compiled only when
 * KVM_CAP_IRQCHIP is defined.
 */
2543 int kvm_set_irq(int irq
, int level
, int *status
)
2545 return kvm_set_irq_level(kvm_context
, irq
, level
, status
);
/*
 * Forward to kvm_get_dirty_pages() on the global kvm_context with
 * `phys_addr` and `buf`, and return its result.
 */
2550 int qemu_kvm_get_dirty_pages(unsigned long phys_addr
, void *buf
)
2552 return kvm_get_dirty_pages(kvm_context
, phys_addr
, buf
);
/*
 * Release qemu_mutex. Asserts that cpu_single_env is NULL on entry.
 */
2555 void kvm_mutex_unlock(void)
2557 assert(!cpu_single_env
);
2558 pthread_mutex_unlock(&qemu_mutex
);
/*
 * Acquire qemu_mutex and reset cpu_single_env to NULL, which is the
 * state kvm_mutex_unlock() asserts.
 */
2561 void kvm_mutex_lock(void)
2563 pthread_mutex_lock(&qemu_mutex
);
2564 cpu_single_env
= NULL
;
2567 #ifdef USE_KVM_DEVICE_ASSIGNMENT
/*
 * Insert `data` at the head of the ioperm_head list. The list keeps the
 * pointer itself; no copy is made.
 */
2568 void kvm_add_ioperm_data(struct ioperm_data
*data
)
2570 LIST_INSERT_HEAD(&ioperm_head
, data
, entries
);
/*
 * Walk the ioperm_head list and unlink entries whose start_port and num
 * both match the arguments. The successor is fetched before the current
 * entry is examined, so unlinking does not break the walk.
 * NOTE(review): the loop header, the advance step and whatever is done
 * with an unlinked entry (for example freeing it) are on lines omitted
 * from this listing.
 */
2573 void kvm_remove_ioperm_data(unsigned long start_port
, unsigned long num
)
2575 struct ioperm_data
*data
;
2577 data
= LIST_FIRST(&ioperm_head
);
2579 struct ioperm_data
*next
= LIST_NEXT(data
, entries
);
2581 if (data
->start_port
== start_port
&& data
->num
== num
) {
2582 LIST_REMOVE(data
, entries
);
/*
 * Run kvm_arch_do_ioperm(data) for `env` through on_vcpu(), but only
 * when kvm_enabled() is true and qemu_system_ready has been set.
 */
2590 void kvm_ioperm(CPUState
*env
, void *data
)
2592 if (kvm_enabled() && qemu_system_ready
)
2593 on_vcpu(env
, kvm_arch_do_ioperm
, data
);
/*
 * Pull dirty bits for [start_addr, end_addr) from KVM via
 * kvm_get_dirty_pages_range(), with kvm_get_dirty_bitmap_cb as callback
 * and a NULL argument before it. Alias-source start addresses (see
 * must_use_aliases_source()) are tested first.
 * NOTE(review): what is done for an alias-source address and the return
 * value are on lines omitted from this listing.
 */
2598 int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr
, target_phys_addr_t end_addr
)
2603 if (must_use_aliases_source(start_addr
))
2607 kvm_get_dirty_pages_range(kvm_context
, start_addr
, end_addr
- start_addr
,
2608 NULL
, kvm_get_dirty_bitmap_cb
);
/*
 * Start dirty logging for [phys_addr, phys_addr + len) by calling
 * kvm_qemu_log_memory() with 1 as the last argument. Alias-source
 * addresses are tested first.
 * NOTE(review): what is done for an alias-source address and the return
 * value are on lines omitted from this listing.
 */
2613 int kvm_log_start(target_phys_addr_t phys_addr
, target_phys_addr_t len
)
2616 if (must_use_aliases_source(phys_addr
))
2621 kvm_qemu_log_memory(phys_addr
, len
, 1);
/*
 * Stop dirty logging for [phys_addr, phys_addr + len) by calling
 * kvm_qemu_log_memory() with 0 as the last argument. Alias-source
 * addresses are tested first.
 * NOTE(review): what is done for an alias-source address and the return
 * value are on lines omitted from this listing.
 */
2626 int kvm_log_stop(target_phys_addr_t phys_addr
, target_phys_addr_t len
)
2629 if (must_use_aliases_source(phys_addr
))
2634 kvm_qemu_log_memory(phys_addr
, len
, 0);
/*
 * Set env's kvm_cpu_state.stopped flag, which is_cpu_stopped() tests.
 * NOTE(review): a guard may sit on the lines omitted between the
 * signature and the assignment.
 */
2639 void qemu_kvm_cpu_stop(CPUState
*env
)
2642 env
->kvm_cpu_state
.stopped
= 1;
/*
 * Forward to kvm_set_boot_vcpu_id() on the global kvm_context with `id`
 * and return its result.
 */
2645 int kvm_set_boot_cpu_id(uint32_t id
)
2647 return kvm_set_boot_vcpu_id(kvm_context
, id
);
/*
 * Argument bundle passed through on_vcpu() from kvm_inject_x86_mce() to
 * kvm_do_inject_x86_mce(): `mce` points at the machine-check record to
 * inject.
 * NOTE(review): only `mce` is visible in this listing;
 * kvm_do_inject_x86_mce() also reads a member `env`, declared on an
 * omitted line.
 */
2652 struct kvm_x86_mce_data
2655 struct kvm_x86_mce
*mce
;
/*
 * Work callback for on_vcpu(): calls kvm_set_mce() with the vcpu context
 * of data->env and the record data->mce, and reports
 * "kvm_set_mce FAILED" via perror() on a path whose condition is not
 * shown (presumably r < 0 - confirm).
 */
2658 static void kvm_do_inject_x86_mce(void *_data
)
2660 struct kvm_x86_mce_data
*data
= _data
;
2663 r
= kvm_set_mce(data
->env
->kvm_cpu_state
.vcpu_ctx
, data
->mce
);
2665 perror("kvm_set_mce FAILED");
/*
 * Build a struct kvm_x86_mce from the arguments (the visible initialiser
 * sets .mcg_status), wrap it in a kvm_x86_mce_data and hand it to
 * kvm_do_inject_x86_mce() on `cenv` through on_vcpu().
 * NOTE(review): the remaining initialisers, any guard before the
 * on_vcpu() call and everything after it are outside this listing. Both
 * structures are locals and on_vcpu() is handed the address of `data`.
 */
2669 void kvm_inject_x86_mce(CPUState
*cenv
, int bank
, uint64_t status
,
2670 uint64_t mcg_status
, uint64_t addr
, uint64_t misc
)
2673 struct kvm_x86_mce mce
= {
2676 .mcg_status
= mcg_status
,
2680 struct kvm_x86_mce_data data
= {
2685 on_vcpu(cenv
, kvm_do_inject_x86_mce
, &data
);