4 * Copyright (C) 2006-2008 Qumranet Technologies
6 * Licensed under the terms of the GNU GPL version 2 or higher.
9 #include "config-host.h"
19 #include "qemu-common.h"
26 #include <sys/utsname.h>
27 #include <sys/syscall.h>
29 extern void perror(const char *s
);
31 kvm_context_t kvm_context
;
35 pthread_mutex_t qemu_mutex
= PTHREAD_MUTEX_INITIALIZER
;
36 pthread_cond_t qemu_aio_cond
= PTHREAD_COND_INITIALIZER
;
37 pthread_cond_t qemu_vcpu_cond
= PTHREAD_COND_INITIALIZER
;
38 pthread_cond_t qemu_system_cond
= PTHREAD_COND_INITIALIZER
;
39 pthread_cond_t qemu_pause_cond
= PTHREAD_COND_INITIALIZER
;
40 __thread
struct vcpu_info
*vcpu
;
42 static int qemu_system_ready
;
44 #define SIG_IPI (SIGRTMIN+4)
58 static int io_thread_fd
= -1;
59 static int io_thread_sigfd
= -1;
61 static inline unsigned long kvm_get_thread_id(void)
63 return syscall(SYS_gettid
);
66 static void qemu_cond_wait(pthread_cond_t
*cond
)
68 CPUState
*env
= cpu_single_env
;
70 pthread_cond_wait(cond
, &qemu_mutex
);
74 CPUState
*qemu_kvm_cpu_env(int index
)
76 return vcpu_info
[index
].env
;
79 static void sig_ipi_handler(int n
)
83 void kvm_update_interrupt_request(CPUState
*env
)
90 if (vcpu
&& env
!= vcpu
->env
&& !vcpu_info
[env
->cpu_index
].signalled
)
94 vcpu_info
[env
->cpu_index
].signalled
= 1;
95 if (vcpu_info
[env
->cpu_index
].thread
)
96 pthread_kill(vcpu_info
[env
->cpu_index
].thread
, SIG_IPI
);
101 void kvm_update_after_sipi(CPUState
*env
)
103 vcpu_info
[env
->cpu_index
].sipi_needed
= 1;
104 kvm_update_interrupt_request(env
);
107 void kvm_apic_init(CPUState
*env
)
109 if (env
->cpu_index
!= 0)
110 vcpu_info
[env
->cpu_index
].init
= 1;
111 kvm_update_interrupt_request(env
);
116 static int try_push_interrupts(void *opaque
)
118 return kvm_arch_try_push_interrupts(opaque
);
121 static void post_kvm_run(void *opaque
, int vcpu
)
124 pthread_mutex_lock(&qemu_mutex
);
125 kvm_arch_post_kvm_run(opaque
, vcpu
);
128 static int pre_kvm_run(void *opaque
, int vcpu
)
130 CPUState
*env
= qemu_kvm_cpu_env(vcpu
);
132 kvm_arch_pre_kvm_run(opaque
, vcpu
);
134 if (env
->interrupt_request
& CPU_INTERRUPT_EXIT
)
136 pthread_mutex_unlock(&qemu_mutex
);
140 void kvm_load_registers(CPUState
*env
)
143 kvm_arch_load_regs(env
);
146 void kvm_save_registers(CPUState
*env
)
149 kvm_arch_save_regs(env
);
152 int kvm_cpu_exec(CPUState
*env
)
156 r
= kvm_run(kvm_context
, env
->cpu_index
);
158 printf("kvm_run returned %d\n", r
);
165 extern int vm_running
;
167 static int has_work(CPUState
*env
)
169 if (!vm_running
|| (env
&& vcpu_info
[env
->cpu_index
].stopped
))
171 if (!(env
->hflags
& HF_HALTED_MASK
))
173 return kvm_arch_has_work(env
);
176 static void kvm_main_loop_wait(CPUState
*env
, int timeout
)
183 pthread_mutex_unlock(&qemu_mutex
);
185 ts
.tv_sec
= timeout
/ 1000;
186 ts
.tv_nsec
= (timeout
% 1000) * 1000000;
187 sigemptyset(&waitset
);
188 sigaddset(&waitset
, SIG_IPI
);
190 r
= sigtimedwait(&waitset
, &siginfo
, &ts
);
193 pthread_mutex_lock(&qemu_mutex
);
195 if (r
== -1 && !(e
== EAGAIN
|| e
== EINTR
)) {
196 printf("sigtimedwait: %s\n", strerror(e
));
200 if (vcpu_info
[env
->cpu_index
].stop
) {
201 vcpu_info
[env
->cpu_index
].stop
= 0;
202 vcpu_info
[env
->cpu_index
].stopped
= 1;
203 pthread_cond_signal(&qemu_pause_cond
);
205 cpu_single_env
= env
;
207 vcpu_info
[env
->cpu_index
].signalled
= 0;
210 static int all_threads_paused(void)
214 for (i
= 0; i
< smp_cpus
; ++i
)
215 if (vcpu_info
[i
].stop
)
220 static void pause_all_threads(void)
224 assert(!cpu_single_env
);
226 for (i
= 0; i
< smp_cpus
; ++i
) {
227 vcpu_info
[i
].stop
= 1;
228 pthread_kill(vcpu_info
[i
].thread
, SIG_IPI
);
230 while (!all_threads_paused())
231 qemu_cond_wait(&qemu_pause_cond
);
234 static void resume_all_threads(void)
238 assert(!cpu_single_env
);
240 for (i
= 0; i
< smp_cpus
; ++i
) {
241 vcpu_info
[i
].stop
= 0;
242 vcpu_info
[i
].stopped
= 0;
243 pthread_kill(vcpu_info
[i
].thread
, SIG_IPI
);
247 static void kvm_vm_state_change_handler(void *context
, int running
)
250 resume_all_threads();
255 static void update_regs_for_sipi(CPUState
*env
)
257 kvm_arch_update_regs_for_sipi(env
);
258 vcpu_info
[env
->cpu_index
].sipi_needed
= 0;
259 vcpu_info
[env
->cpu_index
].init
= 0;
262 static void update_regs_for_init(CPUState
*env
)
265 kvm_arch_load_regs(env
);
268 static void setup_kernel_sigmask(CPUState
*env
)
273 sigaddset(&set
, SIGUSR2
);
274 sigaddset(&set
, SIGIO
);
275 sigaddset(&set
, SIGALRM
);
276 sigprocmask(SIG_BLOCK
, &set
, NULL
);
278 sigprocmask(SIG_BLOCK
, NULL
, &set
);
279 sigdelset(&set
, SIG_IPI
);
281 kvm_set_signal_mask(kvm_context
, env
->cpu_index
, &set
);
284 void qemu_kvm_system_reset(void)
292 for (i
= 0; i
< smp_cpus
; ++i
)
293 kvm_arch_cpu_reset(vcpu_info
[i
].env
);
295 resume_all_threads();
298 static int kvm_main_loop_cpu(CPUState
*env
)
300 struct vcpu_info
*info
= &vcpu_info
[env
->cpu_index
];
302 setup_kernel_sigmask(env
);
304 pthread_mutex_lock(&qemu_mutex
);
305 if (kvm_irqchip_in_kernel(kvm_context
))
306 env
->hflags
&= ~HF_HALTED_MASK
;
308 kvm_qemu_init_env(env
);
309 env
->ready_for_interrupt_injection
= 1;
311 kvm_tpr_vcpu_start(env
);
314 cpu_single_env
= env
;
316 while (!has_work(env
))
317 kvm_main_loop_wait(env
, 1000);
318 if (env
->interrupt_request
& CPU_INTERRUPT_HARD
)
319 env
->hflags
&= ~HF_HALTED_MASK
;
320 if (!kvm_irqchip_in_kernel(kvm_context
) && info
->sipi_needed
)
321 update_regs_for_sipi(env
);
322 if (!kvm_irqchip_in_kernel(kvm_context
) && info
->init
)
323 update_regs_for_init(env
);
324 if (!(env
->hflags
& HF_HALTED_MASK
) && !info
->init
)
326 env
->interrupt_request
&= ~CPU_INTERRUPT_EXIT
;
327 kvm_main_loop_wait(env
, 0);
329 pthread_mutex_unlock(&qemu_mutex
);
333 static void *ap_main_loop(void *_env
)
335 CPUState
*env
= _env
;
338 vcpu
= &vcpu_info
[env
->cpu_index
];
340 vcpu
->env
->thread_id
= kvm_get_thread_id();
341 sigfillset(&signals
);
342 sigprocmask(SIG_BLOCK
, &signals
, NULL
);
343 kvm_create_vcpu(kvm_context
, env
->cpu_index
);
344 kvm_qemu_init_env(env
);
346 /* signal VCPU creation */
347 pthread_mutex_lock(&qemu_mutex
);
349 pthread_cond_signal(&qemu_vcpu_cond
);
351 /* and wait for machine initialization */
352 while (!qemu_system_ready
)
353 qemu_cond_wait(&qemu_system_cond
);
354 pthread_mutex_unlock(&qemu_mutex
);
356 kvm_main_loop_cpu(env
);
360 void kvm_init_new_ap(int cpu
, CPUState
*env
)
362 pthread_create(&vcpu_info
[cpu
].thread
, NULL
, ap_main_loop
, env
);
364 while (vcpu_info
[cpu
].created
== 0)
365 qemu_cond_wait(&qemu_vcpu_cond
);
368 int kvm_init_ap(void)
373 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler
, NULL
);
375 signal(SIG_IPI
, sig_ipi_handler
);
379 void qemu_kvm_notify_work(void)
385 if (io_thread_fd
== -1)
388 memcpy(buffer
, &value
, sizeof(value
));
393 len
= write(io_thread_fd
, buffer
+ offset
, 8 - offset
);
394 if (len
== -1 && errno
== EINTR
)
404 fprintf(stderr
, "failed to notify io thread\n");
407 /* If we have signalfd, we mask out the signals we want to handle and then
408 * use signalfd to listen for them. We rely on whatever the current signal
409 * handler is to dispatch the signals when we receive them.
412 static void sigfd_handler(void *opaque
)
414 int fd
= (unsigned long)opaque
;
415 struct signalfd_siginfo info
;
416 struct sigaction action
;
421 len
= read(fd
, &info
, sizeof(info
));
422 } while (len
== -1 && errno
== EINTR
);
424 if (len
== -1 && errno
== EAGAIN
)
427 if (len
!= sizeof(info
)) {
428 printf("read from sigfd returned %ld: %m\n", len
);
432 sigaction(info
.ssi_signo
, NULL
, &action
);
433 if (action
.sa_handler
)
434 action
.sa_handler(info
.ssi_signo
);
436 if (info
.ssi_signo
== SIGUSR2
) {
437 pthread_cond_signal(&qemu_aio_cond
);
442 /* Used to break IO thread out of select */
443 static void io_thread_wakeup(void *opaque
)
445 int fd
= (unsigned long)opaque
;
452 len
= read(fd
, buffer
+ offset
, 8 - offset
);
453 if (len
== -1 && errno
== EINTR
)
463 int kvm_main_loop(void)
469 io_thread
= pthread_self();
470 qemu_system_ready
= 1;
472 if (kvm_eventfd(fds
) == -1) {
473 fprintf(stderr
, "failed to create eventfd\n");
477 qemu_set_fd_handler2(fds
[0], NULL
, io_thread_wakeup
, NULL
,
478 (void *)(unsigned long)fds
[0]);
480 io_thread_fd
= fds
[1];
483 sigaddset(&mask
, SIGIO
);
484 sigaddset(&mask
, SIGALRM
);
485 sigaddset(&mask
, SIGUSR2
);
486 sigprocmask(SIG_BLOCK
, &mask
, NULL
);
488 sigfd
= kvm_signalfd(&mask
);
490 fprintf(stderr
, "failed to create signalfd\n");
494 fcntl(sigfd
, F_SETFL
, O_NONBLOCK
);
496 qemu_set_fd_handler2(sigfd
, NULL
, sigfd_handler
, NULL
,
497 (void *)(unsigned long)sigfd
);
499 pthread_cond_broadcast(&qemu_system_cond
);
501 io_thread_sigfd
= sigfd
;
502 cpu_single_env
= NULL
;
505 main_loop_wait(1000);
506 if (qemu_shutdown_requested())
508 else if (qemu_powerdown_requested())
509 qemu_system_powerdown();
510 else if (qemu_reset_requested())
511 qemu_kvm_system_reset();
515 pthread_mutex_unlock(&qemu_mutex
);
520 static int kvm_debug(void *opaque
, int vcpu
)
522 CPUState
*env
= cpu_single_env
;
524 env
->exception_index
= EXCP_DEBUG
;
528 static int kvm_inb(void *opaque
, uint16_t addr
, uint8_t *data
)
530 *data
= cpu_inb(0, addr
);
534 static int kvm_inw(void *opaque
, uint16_t addr
, uint16_t *data
)
536 *data
= cpu_inw(0, addr
);
540 static int kvm_inl(void *opaque
, uint16_t addr
, uint32_t *data
)
542 *data
= cpu_inl(0, addr
);
546 #define PM_IO_BASE 0xb000
548 static int kvm_outb(void *opaque
, uint16_t addr
, uint8_t data
)
553 cpu_outb(0, 0xb3, 0);
560 x
= cpu_inw(0, PM_IO_BASE
+ 4);
562 cpu_outw(0, PM_IO_BASE
+ 4, x
);
569 x
= cpu_inw(0, PM_IO_BASE
+ 4);
571 cpu_outw(0, PM_IO_BASE
+ 4, x
);
579 cpu_outb(0, addr
, data
);
583 static int kvm_outw(void *opaque
, uint16_t addr
, uint16_t data
)
585 cpu_outw(0, addr
, data
);
589 static int kvm_outl(void *opaque
, uint16_t addr
, uint32_t data
)
591 cpu_outl(0, addr
, data
);
595 static int kvm_mmio_read(void *opaque
, uint64_t addr
, uint8_t *data
, int len
)
597 cpu_physical_memory_rw(addr
, data
, len
, 0);
601 static int kvm_mmio_write(void *opaque
, uint64_t addr
, uint8_t *data
, int len
)
603 cpu_physical_memory_rw(addr
, data
, len
, 1);
607 static int kvm_io_window(void *opaque
)
613 static int kvm_halt(void *opaque
, int vcpu
)
615 return kvm_arch_halt(opaque
, vcpu
);
618 static int kvm_shutdown(void *opaque
, int vcpu
)
620 /* stop the current vcpu from going back to guest mode */
621 vcpu_info
[cpu_single_env
->cpu_index
].stopped
= 1;
623 qemu_system_reset_request();
627 static struct kvm_callbacks qemu_kvm_ops
= {
635 .mmio_read
= kvm_mmio_read
,
636 .mmio_write
= kvm_mmio_write
,
638 .shutdown
= kvm_shutdown
,
639 .io_window
= kvm_io_window
,
640 .try_push_interrupts
= try_push_interrupts
,
641 .post_kvm_run
= post_kvm_run
,
642 .pre_kvm_run
= pre_kvm_run
,
644 .tpr_access
= handle_tpr_access
,
647 .powerpc_dcr_read
= handle_powerpc_dcr_read
,
648 .powerpc_dcr_write
= handle_powerpc_dcr_write
,
654 /* Try to initialize kvm */
655 kvm_context
= kvm_init(&qemu_kvm_ops
, cpu_single_env
);
659 pthread_mutex_lock(&qemu_mutex
);
664 int kvm_qemu_create_context(void)
668 kvm_disable_irqchip_creation(kvm_context
);
671 kvm_disable_pit_creation(kvm_context
);
673 if (kvm_create(kvm_context
, phys_ram_size
, (void**)&phys_ram_base
) < 0) {
677 r
= kvm_arch_qemu_create_context();
683 void kvm_qemu_destroy(void)
685 kvm_finalize(kvm_context
);
688 void kvm_cpu_register_physical_memory(target_phys_addr_t start_addr
,
690 unsigned long phys_offset
)
692 #ifdef KVM_CAP_USER_MEMORY
695 r
= kvm_check_extension(kvm_context
, KVM_CAP_USER_MEMORY
);
697 if (!(phys_offset
& ~TARGET_PAGE_MASK
)) {
698 r
= kvm_is_allocated_mem(kvm_context
, start_addr
, size
);
701 r
= kvm_is_intersecting_mem(kvm_context
, start_addr
);
703 kvm_create_mem_hole(kvm_context
, start_addr
, size
);
704 r
= kvm_register_userspace_phys_mem(kvm_context
, start_addr
,
705 phys_ram_base
+ phys_offset
,
708 if (phys_offset
& IO_MEM_ROM
) {
709 phys_offset
&= ~IO_MEM_ROM
;
710 r
= kvm_is_intersecting_mem(kvm_context
, start_addr
);
712 kvm_create_mem_hole(kvm_context
, start_addr
, size
);
713 r
= kvm_register_userspace_phys_mem(kvm_context
, start_addr
,
714 phys_ram_base
+ phys_offset
,
718 printf("kvm_cpu_register_physical_memory: failed\n");
724 if (phys_offset
& IO_MEM_ROM
) {
725 phys_offset
&= ~IO_MEM_ROM
;
726 memcpy(phys_ram_base
+ start_addr
, phys_ram_base
+ phys_offset
, size
);
730 int kvm_qemu_check_extension(int ext
)
732 return kvm_check_extension(kvm_context
, ext
);
735 int kvm_qemu_init_env(CPUState
*cenv
)
737 return kvm_arch_qemu_init_env(cenv
);
740 int kvm_update_debugger(CPUState
*env
)
742 struct kvm_debug_guest dbg
;
745 memset(dbg
.breakpoints
, 0, sizeof(dbg
.breakpoints
));
748 if (env
->nb_breakpoints
|| env
->singlestep_enabled
) {
750 for (i
= 0; i
< 4 && i
< env
->nb_breakpoints
; ++i
) {
751 dbg
.breakpoints
[i
].enabled
= 1;
752 dbg
.breakpoints
[i
].address
= env
->breakpoints
[i
];
754 dbg
.singlestep
= env
->singlestep_enabled
;
756 return kvm_guest_debug(kvm_context
, env
->cpu_index
, &dbg
);
761 * dirty pages logging
763 /* FIXME: use unsigned long pointer instead of unsigned char */
764 unsigned char *kvm_dirty_bitmap
= NULL
;
765 int kvm_physical_memory_set_dirty_tracking(int enable
)
773 if (!kvm_dirty_bitmap
) {
774 unsigned bitmap_size
= BITMAP_SIZE(phys_ram_size
);
775 kvm_dirty_bitmap
= qemu_malloc(bitmap_size
);
776 if (kvm_dirty_bitmap
== NULL
) {
777 perror("Failed to allocate dirty pages bitmap");
781 r
= kvm_dirty_pages_log_enable_all(kvm_context
);
786 if (kvm_dirty_bitmap
) {
787 r
= kvm_dirty_pages_log_reset(kvm_context
);
788 qemu_free(kvm_dirty_bitmap
);
789 kvm_dirty_bitmap
= NULL
;
795 /* get kvm's dirty pages bitmap and update qemu's */
796 int kvm_get_dirty_pages_log_range(unsigned long start_addr
,
797 unsigned char *bitmap
,
799 unsigned long mem_size
)
801 unsigned int i
, j
, n
=0;
803 unsigned page_number
, addr
, addr1
;
804 unsigned int len
= ((mem_size
/TARGET_PAGE_SIZE
) + 7) / 8;
807 * bitmap-traveling is faster than memory-traveling (for addr...)
808 * especially when most of the memory is not dirty.
810 for (i
=0; i
<len
; i
++) {
815 page_number
= i
* 8 + j
;
816 addr1
= page_number
* TARGET_PAGE_SIZE
;
817 addr
= offset
+ addr1
;
818 cpu_physical_memory_set_dirty(addr
);
824 int kvm_get_dirty_bitmap_cb(unsigned long start
, unsigned long len
,
825 void *bitmap
, void *opaque
)
827 return kvm_get_dirty_pages_log_range(start
, bitmap
, start
, len
);
831 * get kvm's dirty pages bitmap and update qemu's
832 * we only care about physical ram, which resides in slots 0 and 3
834 int kvm_update_dirty_pages_log(void)
839 r
= kvm_get_dirty_pages_range(kvm_context
, 0, phys_ram_size
,
840 kvm_dirty_bitmap
, NULL
,
841 kvm_get_dirty_bitmap_cb
);
845 int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap
)
847 unsigned int bsize
= BITMAP_SIZE(phys_ram_size
);
848 unsigned int brsize
= BITMAP_SIZE(ram_size
);
849 unsigned int extra_pages
= (phys_ram_size
- ram_size
) / TARGET_PAGE_SIZE
;
850 unsigned int extra_bytes
= (extra_pages
+7)/8;
851 unsigned int hole_start
= BITMAP_SIZE(0xa0000);
852 unsigned int hole_end
= BITMAP_SIZE(0xc0000);
854 memset(bitmap
, 0xFF, brsize
+ extra_bytes
);
855 memset(bitmap
+ hole_start
, 0, hole_end
- hole_start
);
856 memset(bitmap
+ brsize
+ extra_bytes
, 0, bsize
- brsize
- extra_bytes
);
861 #ifdef KVM_CAP_IRQCHIP
863 int kvm_set_irq(int irq
, int level
)
865 return kvm_set_irq_level(kvm_context
, irq
, level
);
870 void qemu_kvm_aio_wait_start(void)
874 void qemu_kvm_aio_wait(void)
876 if (!cpu_single_env
) {
877 if (io_thread_sigfd
!= -1) {
882 FD_SET(io_thread_sigfd
, &rfds
);
884 /* this is a rare case where we do want to hold qemu_mutex
885 * while sleeping. We cannot allow anything else to run
887 ret
= select(io_thread_sigfd
+ 1, &rfds
, NULL
, NULL
, NULL
);
888 if (ret
> 0 && FD_ISSET(io_thread_sigfd
, &rfds
))
889 sigfd_handler((void *)(unsigned long)io_thread_sigfd
);
893 qemu_cond_wait(&qemu_aio_cond
);
896 void qemu_kvm_aio_wait_end(void)
900 int qemu_kvm_get_dirty_pages(unsigned long phys_addr
, void *buf
)
902 return kvm_get_dirty_pages(kvm_context
, phys_addr
, buf
);
905 void *kvm_cpu_create_phys_mem(target_phys_addr_t start_addr
,
906 unsigned long size
, int log
, int writable
)
908 return kvm_create_phys_mem(kvm_context
, start_addr
, size
, log
, writable
);
911 void kvm_cpu_destroy_phys_mem(target_phys_addr_t start_addr
,
914 kvm_destroy_phys_mem(kvm_context
, start_addr
, size
);
917 void kvm_mutex_unlock(void)
919 assert(!cpu_single_env
);
920 pthread_mutex_unlock(&qemu_mutex
);
923 void kvm_mutex_lock(void)
925 pthread_mutex_lock(&qemu_mutex
);
926 cpu_single_env
= NULL
;