/*
 * qemu/kvm integration
 *
 * Source: qemu-kvm/fedora.git, qemu-kvm.c
 * (commit: "Don't quit IO thread before all vcpus are stopped";
 *  blob 78127dee0d2564b5fbac087828096b83f1464f7e)
 *
 * Copyright (C) 2006-2008 Qumranet Technologies
 *
 * Licensed under the terms of the GNU GPL version 2 or higher.
 */
#include "config.h"
#include "config-host.h"

int kvm_allowed = 1;
int kvm_irqchip = 1;
int kvm_pit = 1;

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "hw/hw.h"
#include "sysemu.h"

#include "qemu-kvm.h"
#include <libkvm.h>
#include <pthread.h>
#include <sys/utsname.h>
#include <sys/syscall.h>
25 extern void perror(const char *s);
27 kvm_context_t kvm_context;
29 extern int smp_cpus;
31 static int qemu_kvm_reset_requested;
33 pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
34 pthread_cond_t qemu_aio_cond = PTHREAD_COND_INITIALIZER;
35 __thread struct vcpu_info *vcpu;
37 struct qemu_kvm_signal_table {
38 sigset_t sigset;
39 sigset_t negsigset;
42 static struct qemu_kvm_signal_table io_signal_table;
43 static struct qemu_kvm_signal_table vcpu_signal_table;
45 #define SIG_IPI (SIGRTMIN+4)
47 struct vcpu_info {
48 CPUState *env;
49 int sipi_needed;
50 int init;
51 pthread_t thread;
52 int signalled;
53 int stop;
54 int stopped;
55 int reload_regs;
56 } vcpu_info[256];
58 pthread_t io_thread;
/* Return the Linux kernel tid of the calling thread (gettid has no
 * glibc wrapper here, so go through syscall(2) directly). */
static inline unsigned long kvm_get_thread_id(void)
{
    return syscall(SYS_gettid);
}
65 CPUState *qemu_kvm_cpu_env(int index)
67 return vcpu_info[index].env;
70 static void sig_ipi_handler(int n)
74 void kvm_update_interrupt_request(CPUState *env)
76 int signal = 0;
78 if (env) {
79 if (!vcpu)
80 signal = 1;
81 if (vcpu && env != vcpu->env && !vcpu_info[env->cpu_index].signalled)
82 signal = 1;
84 if (signal) {
85 vcpu_info[env->cpu_index].signalled = 1;
86 if (vcpu_info[env->cpu_index].thread)
87 pthread_kill(vcpu_info[env->cpu_index].thread, SIG_IPI);
92 void kvm_update_after_sipi(CPUState *env)
94 vcpu_info[env->cpu_index].sipi_needed = 1;
95 kvm_update_interrupt_request(env);
98 void kvm_apic_init(CPUState *env)
100 if (env->cpu_index != 0)
101 vcpu_info[env->cpu_index].init = 1;
102 kvm_update_interrupt_request(env);
105 #include <signal.h>
/* libkvm callback: delegate interrupt injection to the arch layer. */
static int try_push_interrupts(void *opaque)
{
    return kvm_arch_try_push_interrupts(opaque);
}
112 static void post_kvm_run(void *opaque, int vcpu)
115 pthread_mutex_lock(&qemu_mutex);
116 kvm_arch_post_kvm_run(opaque, vcpu);
119 static int pre_kvm_run(void *opaque, int vcpu)
121 CPUState *env = qemu_kvm_cpu_env(vcpu);
123 kvm_arch_pre_kvm_run(opaque, vcpu);
125 if (env->interrupt_request & CPU_INTERRUPT_EXIT)
126 return 1;
127 pthread_mutex_unlock(&qemu_mutex);
128 return 0;
131 void kvm_load_registers(CPUState *env)
133 if (kvm_enabled())
134 kvm_arch_load_regs(env);
137 void kvm_save_registers(CPUState *env)
139 if (kvm_enabled())
140 kvm_arch_save_regs(env);
143 int kvm_cpu_exec(CPUState *env)
145 int r;
147 r = kvm_run(kvm_context, env->cpu_index);
148 if (r < 0) {
149 printf("kvm_run returned %d\n", r);
150 exit(1);
153 return 0;
156 extern int vm_running;
158 static int has_work(CPUState *env)
160 if (!vm_running || (env && vcpu_info[env->cpu_index].stopped))
161 return 0;
162 if (!(env->hflags & HF_HALTED_MASK))
163 return 1;
164 return kvm_arch_has_work(env);
167 static int kvm_process_signal(int si_signo)
169 struct sigaction sa;
171 switch (si_signo) {
172 case SIGUSR2:
173 pthread_cond_signal(&qemu_aio_cond);
174 break;
175 case SIGALRM:
176 case SIGIO:
177 sigaction(si_signo, NULL, &sa);
178 sa.sa_handler(si_signo);
179 break;
182 return 1;
185 static int kvm_eat_signal(struct qemu_kvm_signal_table *waitset, CPUState *env,
186 int timeout)
188 struct timespec ts;
189 int r, e, ret = 0;
190 siginfo_t siginfo;
192 ts.tv_sec = timeout / 1000;
193 ts.tv_nsec = (timeout % 1000) * 1000000;
194 r = sigtimedwait(&waitset->sigset, &siginfo, &ts);
195 if (r == -1 && (errno == EAGAIN || errno == EINTR) && !timeout)
196 return 0;
197 e = errno;
198 pthread_mutex_lock(&qemu_mutex);
199 if (env && vcpu)
200 cpu_single_env = vcpu->env;
201 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
202 printf("sigtimedwait: %s\n", strerror(e));
203 exit(1);
205 if (r != -1)
206 ret = kvm_process_signal(siginfo.si_signo);
208 if (env && vcpu_info[env->cpu_index].stop) {
209 vcpu_info[env->cpu_index].stop = 0;
210 vcpu_info[env->cpu_index].stopped = 1;
211 pthread_kill(io_thread, SIGUSR1);
213 pthread_mutex_unlock(&qemu_mutex);
215 return ret;
219 static void kvm_eat_signals(CPUState *env, int timeout)
221 int r = 0;
222 struct qemu_kvm_signal_table *waitset = &vcpu_signal_table;
224 while (kvm_eat_signal(waitset, env, 0))
225 r = 1;
226 if (!r && timeout) {
227 r = kvm_eat_signal(waitset, env, timeout);
228 if (r)
229 while (kvm_eat_signal(waitset, env, 0))
234 static void kvm_main_loop_wait(CPUState *env, int timeout)
236 pthread_mutex_unlock(&qemu_mutex);
237 kvm_eat_signals(env, timeout);
238 pthread_mutex_lock(&qemu_mutex);
239 cpu_single_env = env;
240 vcpu_info[env->cpu_index].signalled = 0;
243 static int all_threads_paused(void)
245 int i;
247 for (i = 0; i < smp_cpus; ++i)
248 if (vcpu_info[i].stop)
249 return 0;
250 return 1;
253 static void pause_all_threads(void)
255 int i;
257 for (i = 0; i < smp_cpus; ++i) {
258 vcpu_info[i].stop = 1;
259 pthread_kill(vcpu_info[i].thread, SIG_IPI);
261 while (!all_threads_paused()) {
262 pthread_mutex_unlock(&qemu_mutex);
263 kvm_eat_signal(&io_signal_table, NULL, 1000);
264 pthread_mutex_lock(&qemu_mutex);
265 cpu_single_env = NULL;
269 static void resume_all_threads(void)
271 int i;
273 for (i = 0; i < smp_cpus; ++i) {
274 vcpu_info[i].stop = 0;
275 vcpu_info[i].stopped = 0;
276 pthread_kill(vcpu_info[i].thread, SIG_IPI);
/* VM run-state hook: pause or resume all vcpu threads to match. */
static void kvm_vm_state_change_handler(void *context, int running)
{
    if (running)
        resume_all_threads();
    else
        pause_all_threads();
}
288 static void update_regs_for_sipi(CPUState *env)
290 kvm_arch_update_regs_for_sipi(env);
291 vcpu_info[env->cpu_index].sipi_needed = 0;
292 vcpu_info[env->cpu_index].init = 0;
295 static void update_regs_for_init(CPUState *env)
297 cpu_reset(env);
298 kvm_arch_load_regs(env);
301 static void setup_kernel_sigmask(CPUState *env)
303 sigset_t set;
305 sigprocmask(SIG_BLOCK, NULL, &set);
306 sigdelset(&set, SIG_IPI);
308 kvm_set_signal_mask(kvm_context, env->cpu_index, &set);
311 void qemu_kvm_system_reset_request(void)
313 int i;
315 for (i = 0; i < smp_cpus; ++i) {
316 vcpu_info[i].reload_regs = 1;
317 pthread_kill(vcpu_info[i].thread, SIG_IPI);
319 qemu_system_reset();
322 static int kvm_main_loop_cpu(CPUState *env)
324 struct vcpu_info *info = &vcpu_info[env->cpu_index];
326 setup_kernel_sigmask(env);
327 pthread_mutex_lock(&qemu_mutex);
328 if (kvm_irqchip_in_kernel(kvm_context))
329 env->hflags &= ~HF_HALTED_MASK;
331 kvm_qemu_init_env(env);
332 env->ready_for_interrupt_injection = 1;
333 #ifdef TARGET_I386
334 kvm_tpr_vcpu_start(env);
335 #endif
337 cpu_single_env = env;
338 while (1) {
339 while (!has_work(env))
340 kvm_main_loop_wait(env, 10);
341 if (env->interrupt_request & CPU_INTERRUPT_HARD)
342 env->hflags &= ~HF_HALTED_MASK;
343 if (!kvm_irqchip_in_kernel(kvm_context) && info->sipi_needed)
344 update_regs_for_sipi(env);
345 if (!kvm_irqchip_in_kernel(kvm_context) && info->init)
346 update_regs_for_init(env);
347 if (!(env->hflags & HF_HALTED_MASK) && !info->init)
348 kvm_cpu_exec(env);
349 env->interrupt_request &= ~CPU_INTERRUPT_EXIT;
350 kvm_main_loop_wait(env, 0);
351 if (info->reload_regs) {
352 info->reload_regs = 0;
353 if (env->cpu_index == 0) /* ap needs to be placed in INIT */
354 kvm_arch_load_regs(env);
357 pthread_mutex_unlock(&qemu_mutex);
358 return 0;
361 static void *ap_main_loop(void *_env)
363 CPUState *env = _env;
364 sigset_t signals;
366 vcpu = &vcpu_info[env->cpu_index];
367 vcpu->env = env;
368 vcpu->env->thread_id = kvm_get_thread_id();
369 sigfillset(&signals);
370 sigprocmask(SIG_BLOCK, &signals, NULL);
371 kvm_create_vcpu(kvm_context, env->cpu_index);
372 kvm_qemu_init_env(env);
373 kvm_main_loop_cpu(env);
374 return NULL;
377 static void qemu_kvm_init_signal_table(struct qemu_kvm_signal_table *sigtab)
379 sigemptyset(&sigtab->sigset);
380 sigfillset(&sigtab->negsigset);
383 static void kvm_add_signal(struct qemu_kvm_signal_table *sigtab, int signum)
385 sigaddset(&sigtab->sigset, signum);
386 sigdelset(&sigtab->negsigset, signum);
389 void kvm_init_new_ap(int cpu, CPUState *env)
391 pthread_create(&vcpu_info[cpu].thread, NULL, ap_main_loop, env);
392 /* FIXME: wait for thread to spin up */
393 usleep(200);
396 static void qemu_kvm_init_signal_tables(void)
398 qemu_kvm_init_signal_table(&io_signal_table);
399 qemu_kvm_init_signal_table(&vcpu_signal_table);
401 kvm_add_signal(&io_signal_table, SIGIO);
402 kvm_add_signal(&io_signal_table, SIGALRM);
403 kvm_add_signal(&io_signal_table, SIGUSR1);
404 kvm_add_signal(&io_signal_table, SIGUSR2);
406 kvm_add_signal(&vcpu_signal_table, SIG_IPI);
408 sigprocmask(SIG_BLOCK, &io_signal_table.sigset, NULL);
411 int kvm_init_ap(void)
413 #ifdef TARGET_I386
414 kvm_tpr_opt_setup();
415 #endif
416 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
417 qemu_kvm_init_signal_tables();
419 signal(SIG_IPI, sig_ipi_handler);
420 return 0;
423 void qemu_kvm_notify_work(void)
425 if (io_thread)
426 pthread_kill(io_thread, SIGUSR1);
430 * The IO thread has all signals that inform machine events
431 * blocked (io_signal_table), so it won't get interrupted
432 * while processing in main_loop_wait().
435 int kvm_main_loop(void)
437 io_thread = pthread_self();
438 pthread_mutex_unlock(&qemu_mutex);
439 while (1) {
440 kvm_eat_signal(&io_signal_table, NULL, 1000);
441 pthread_mutex_lock(&qemu_mutex);
442 cpu_single_env = NULL;
443 main_loop_wait(0);
444 if (qemu_shutdown_requested())
445 break;
446 else if (qemu_powerdown_requested())
447 qemu_system_powerdown();
448 else if (qemu_reset_requested()) {
449 pthread_kill(vcpu_info[0].thread, SIG_IPI);
450 qemu_kvm_reset_requested = 1;
452 pthread_mutex_unlock(&qemu_mutex);
455 pause_all_threads();
456 pthread_mutex_unlock(&qemu_mutex);
458 return 0;
461 static int kvm_debug(void *opaque, int vcpu)
463 CPUState *env = cpu_single_env;
465 env->exception_index = EXCP_DEBUG;
466 return 1;
/* PIO read callbacks: forward byte/word/long port reads to qemu. */
static int kvm_inb(void *opaque, uint16_t addr, uint8_t *data)
{
    *data = cpu_inb(0, addr);
    return 0;
}

static int kvm_inw(void *opaque, uint16_t addr, uint16_t *data)
{
    *data = cpu_inw(0, addr);
    return 0;
}

static int kvm_inl(void *opaque, uint16_t addr, uint32_t *data)
{
    *data = cpu_inl(0, addr);
    return 0;
}
#define PM_IO_BASE 0xb000

/* PIO byte-write callback.  Port 0xb2 (APM control) is intercepted:
 * 0x00 echoes to 0xb3, 0xf0 clears the ACPI-enable bit in PM control,
 * 0xf1 sets it.  Everything else goes straight to qemu. */
static int kvm_outb(void *opaque, uint16_t addr, uint8_t data)
{
    unsigned pm_ctrl;

    if (addr != 0xb2) {
        cpu_outb(0, addr, data);
        return 0;
    }

    switch (data) {
    case 0:
        cpu_outb(0, 0xb3, 0);
        break;
    case 0xf0:
        /* acpi disable: clear bit 0 of the PM control register */
        pm_ctrl = cpu_inw(0, PM_IO_BASE + 4);
        pm_ctrl &= ~1;
        cpu_outw(0, PM_IO_BASE + 4, pm_ctrl);
        break;
    case 0xf1:
        /* acpi enable: set bit 0 of the PM control register */
        pm_ctrl = cpu_inw(0, PM_IO_BASE + 4);
        pm_ctrl |= 1;
        cpu_outw(0, PM_IO_BASE + 4, pm_ctrl);
        break;
    default:
        break;
    }
    return 0;
}
/* PIO write callbacks: forward word/long port writes to qemu. */
static int kvm_outw(void *opaque, uint16_t addr, uint16_t data)
{
    cpu_outw(0, addr, data);
    return 0;
}

static int kvm_outl(void *opaque, uint16_t addr, uint32_t data)
{
    cpu_outl(0, addr, data);
    return 0;
}
/* MMIO callbacks: service guest MMIO through qemu's physical memory
 * layer (is_write = 0 for reads, 1 for writes). */
static int kvm_mmio_read(void *opaque, uint64_t addr, uint8_t *data, int len)
{
    cpu_physical_memory_rw(addr, data, len, 0);
    return 0;
}

static int kvm_mmio_write(void *opaque, uint64_t addr, uint8_t *data, int len)
{
    cpu_physical_memory_rw(addr, data, len, 1);
    return 0;
}
/* libkvm callback: always ask to return to userspace on an IO window. */
static int kvm_io_window(void *opaque)
{
    return 1;
}
/* libkvm callback for a guest HLT: defer to the arch handler. */
static int kvm_halt(void *opaque, int vcpu)
{
    return kvm_arch_halt(opaque, vcpu);
}
/* libkvm callback for a guest triple-fault/shutdown: request a system
 * reset and return to qemu. */
static int kvm_shutdown(void *opaque, int vcpu)
{
    qemu_system_reset_request();
    return 1;
}
565 static struct kvm_callbacks qemu_kvm_ops = {
566 .debug = kvm_debug,
567 .inb = kvm_inb,
568 .inw = kvm_inw,
569 .inl = kvm_inl,
570 .outb = kvm_outb,
571 .outw = kvm_outw,
572 .outl = kvm_outl,
573 .mmio_read = kvm_mmio_read,
574 .mmio_write = kvm_mmio_write,
575 .halt = kvm_halt,
576 .shutdown = kvm_shutdown,
577 .io_window = kvm_io_window,
578 .try_push_interrupts = try_push_interrupts,
579 .post_kvm_run = post_kvm_run,
580 .pre_kvm_run = pre_kvm_run,
581 #ifdef TARGET_I386
582 .tpr_access = handle_tpr_access,
583 #endif
584 #ifdef TARGET_PPC
585 .powerpc_dcr_read = handle_powerpc_dcr_read,
586 .powerpc_dcr_write = handle_powerpc_dcr_write,
587 #endif
590 int kvm_qemu_init()
592 /* Try to initialize kvm */
593 kvm_context = kvm_init(&qemu_kvm_ops, cpu_single_env);
594 if (!kvm_context) {
595 return -1;
597 pthread_mutex_lock(&qemu_mutex);
599 return 0;
602 int kvm_qemu_create_context(void)
604 int r;
605 if (!kvm_irqchip) {
606 kvm_disable_irqchip_creation(kvm_context);
608 if (!kvm_pit) {
609 kvm_disable_pit_creation(kvm_context);
611 if (kvm_create(kvm_context, phys_ram_size, (void**)&phys_ram_base) < 0) {
612 kvm_qemu_destroy();
613 return -1;
615 r = kvm_arch_qemu_create_context();
616 if(r <0)
617 kvm_qemu_destroy();
618 return 0;
621 void kvm_qemu_destroy(void)
623 kvm_finalize(kvm_context);
626 void kvm_cpu_register_physical_memory(target_phys_addr_t start_addr,
627 unsigned long size,
628 unsigned long phys_offset)
630 #ifdef KVM_CAP_USER_MEMORY
631 int r = 0;
633 r = kvm_check_extension(kvm_context, KVM_CAP_USER_MEMORY);
634 if (r) {
635 if (!(phys_offset & ~TARGET_PAGE_MASK)) {
636 r = kvm_is_allocated_mem(kvm_context, start_addr, size);
637 if (r)
638 return;
639 r = kvm_is_intersecting_mem(kvm_context, start_addr);
640 if (r)
641 kvm_create_mem_hole(kvm_context, start_addr, size);
642 r = kvm_register_userspace_phys_mem(kvm_context, start_addr,
643 phys_ram_base + phys_offset,
644 size, 0);
646 if (phys_offset & IO_MEM_ROM) {
647 phys_offset &= ~IO_MEM_ROM;
648 r = kvm_is_intersecting_mem(kvm_context, start_addr);
649 if (r)
650 kvm_create_mem_hole(kvm_context, start_addr, size);
651 r = kvm_register_userspace_phys_mem(kvm_context, start_addr,
652 phys_ram_base + phys_offset,
653 size, 0);
655 if (r < 0) {
656 printf("kvm_cpu_register_physical_memory: failed\n");
657 exit(1);
659 return;
661 #endif
662 if (phys_offset & IO_MEM_ROM) {
663 phys_offset &= ~IO_MEM_ROM;
664 memcpy(phys_ram_base + start_addr, phys_ram_base + phys_offset, size);
668 int kvm_qemu_check_extension(int ext)
670 return kvm_check_extension(kvm_context, ext);
673 int kvm_qemu_init_env(CPUState *cenv)
675 return kvm_arch_qemu_init_env(cenv);
678 int kvm_update_debugger(CPUState *env)
680 struct kvm_debug_guest dbg;
681 int i;
683 dbg.enabled = 0;
684 if (env->nb_breakpoints || env->singlestep_enabled) {
685 dbg.enabled = 1;
686 for (i = 0; i < 4 && i < env->nb_breakpoints; ++i) {
687 dbg.breakpoints[i].enabled = 1;
688 dbg.breakpoints[i].address = env->breakpoints[i];
690 dbg.singlestep = env->singlestep_enabled;
692 return kvm_guest_debug(kvm_context, env->cpu_index, &dbg);
697 * dirty pages logging
699 /* FIXME: use unsigned long pointer instead of unsigned char */
700 unsigned char *kvm_dirty_bitmap = NULL;
701 int kvm_physical_memory_set_dirty_tracking(int enable)
703 int r = 0;
705 if (!kvm_enabled())
706 return 0;
708 if (enable) {
709 if (!kvm_dirty_bitmap) {
710 unsigned bitmap_size = BITMAP_SIZE(phys_ram_size);
711 kvm_dirty_bitmap = qemu_malloc(bitmap_size);
712 if (kvm_dirty_bitmap == NULL) {
713 perror("Failed to allocate dirty pages bitmap");
714 r=-1;
716 else {
717 r = kvm_dirty_pages_log_enable_all(kvm_context);
721 else {
722 if (kvm_dirty_bitmap) {
723 r = kvm_dirty_pages_log_reset(kvm_context);
724 qemu_free(kvm_dirty_bitmap);
725 kvm_dirty_bitmap = NULL;
728 return r;
731 /* get kvm's dirty pages bitmap and update qemu's */
732 int kvm_get_dirty_pages_log_range(unsigned long start_addr,
733 unsigned char *bitmap,
734 unsigned int offset,
735 unsigned long mem_size)
737 unsigned int i, j, n=0;
738 unsigned char c;
739 unsigned page_number, addr, addr1;
740 unsigned int len = ((mem_size/TARGET_PAGE_SIZE) + 7) / 8;
743 * bitmap-traveling is faster than memory-traveling (for addr...)
744 * especially when most of the memory is not dirty.
746 for (i=0; i<len; i++) {
747 c = bitmap[i];
748 while (c>0) {
749 j = ffsl(c) - 1;
750 c &= ~(1u<<j);
751 page_number = i * 8 + j;
752 addr1 = page_number * TARGET_PAGE_SIZE;
753 addr = offset + addr1;
754 cpu_physical_memory_set_dirty(addr);
755 n++;
758 return 0;
/* libkvm per-slot callback: translate into a log-range update
 * (the slot's start doubles as the qemu-side offset). */
int kvm_get_dirty_bitmap_cb(unsigned long start, unsigned long len,
                            void *bitmap, void *opaque)
{
    return kvm_get_dirty_pages_log_range(start, bitmap, start, len);
}
767 * get kvm's dirty pages bitmap and update qemu's
768 * we only care about physical ram, which resides in slots 0 and 3
770 int kvm_update_dirty_pages_log(void)
772 int r = 0;
775 r = kvm_get_dirty_pages_range(kvm_context, 0, phys_ram_size,
776 kvm_dirty_bitmap, NULL,
777 kvm_get_dirty_bitmap_cb);
778 return r;
781 int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap)
783 unsigned int bsize = BITMAP_SIZE(phys_ram_size);
784 unsigned int brsize = BITMAP_SIZE(ram_size);
785 unsigned int extra_pages = (phys_ram_size - ram_size) / TARGET_PAGE_SIZE;
786 unsigned int extra_bytes = (extra_pages +7)/8;
787 unsigned int hole_start = BITMAP_SIZE(0xa0000);
788 unsigned int hole_end = BITMAP_SIZE(0xc0000);
790 memset(bitmap, 0xFF, brsize + extra_bytes);
791 memset(bitmap + hole_start, 0, hole_end - hole_start);
792 memset(bitmap + brsize + extra_bytes, 0, bsize - brsize - extra_bytes);
794 return 0;
#ifdef KVM_CAP_IRQCHIP

/* Drive an in-kernel irqchip input line to @level. */
int kvm_set_irq(int irq, int level)
{
    return kvm_set_irq_level(kvm_context, irq, level);
}

#endif
806 void qemu_kvm_aio_wait_start(void)
810 void qemu_kvm_aio_wait(void)
812 CPUState *cpu_single = cpu_single_env;
814 if (!cpu_single_env) {
815 pthread_mutex_unlock(&qemu_mutex);
816 kvm_eat_signal(&io_signal_table, NULL, 1000);
817 pthread_mutex_lock(&qemu_mutex);
818 cpu_single_env = NULL;
819 } else {
820 pthread_cond_wait(&qemu_aio_cond, &qemu_mutex);
821 cpu_single_env = cpu_single;
825 void qemu_kvm_aio_wait_end(void)
829 int qemu_kvm_get_dirty_pages(unsigned long phys_addr, void *buf)
831 return kvm_get_dirty_pages(kvm_context, phys_addr, buf);
834 void *kvm_cpu_create_phys_mem(target_phys_addr_t start_addr,
835 unsigned long size, int log, int writable)
837 return kvm_create_phys_mem(kvm_context, start_addr, size, log, writable);
840 void kvm_cpu_destroy_phys_mem(target_phys_addr_t start_addr,
841 unsigned long size)
843 kvm_destroy_phys_mem(kvm_context, start_addr, size);