kvm: libkvm: export a new function to disable irqchip creation
[qemu-kvm/fedora.git] / qemu-kvm.c
blob 16b46e77eb856ad1b32401f8a78c3e15290bb63e
2 #include "config.h"
3 #include "config-host.h"
5 #ifdef USE_KVM
6 #define KVM_ALLOWED_DEFAULT 1
7 #else
8 #define KVM_ALLOWED_DEFAULT 0
9 #endif
11 int kvm_allowed = KVM_ALLOWED_DEFAULT;
12 static int lm_capable_kernel;
14 #ifdef USE_KVM
16 #include <string.h>
17 #include "vl.h"
19 #include "qemu-kvm.h"
20 #include <kvmctl.h>
21 #include <pthread.h>
22 #include <sys/utsname.h>
24 #define MSR_IA32_TSC 0x10
26 extern void perror(const char *s);
28 kvm_context_t kvm_context;
29 static struct kvm_msr_list *kvm_msr_list;
30 static int kvm_has_msr_star;
32 extern int smp_cpus;
34 pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
35 static __thread CPUState *vcpu_env;
37 static sigset_t io_sigset, io_negsigset;
39 static int wait_hack;
41 #define SIG_IPI (SIGRTMIN+4)
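/* per-vcpu bookkeeping: pending SIPI/INIT flags, the vcpu's thread id, and the
 * stop/stopped handshake used to pause and resume the secondary vcpus */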
43 struct vcpu_info {
44 int sipi_needed;
45 int init;
46 pthread_t thread;
47 int signalled;
48 int stop;
49 int stopped;
50 } vcpu_info[4];
52 static void sig_ipi_handler(int n)
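/* poke a remote vcpu thread with SIG_IPI so it re-evaluates its interrupt
 * request; the signalled flag avoids sending the signal more than once */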
56 void kvm_update_interrupt_request(CPUState *env)
58 if (env && env != vcpu_env) {
59 if (vcpu_info[env->cpu_index].signalled)
60 return;
61 vcpu_info[env->cpu_index].signalled = 1;
62 if (vcpu_info[env->cpu_index].thread)
63 pthread_kill(vcpu_info[env->cpu_index].thread, SIG_IPI);
67 void kvm_update_after_sipi(CPUState *env)
69 vcpu_info[env->cpu_index].sipi_needed = 1;
70 kvm_update_interrupt_request(env);
73 * the qemu bios waits using a busy loop that's much too short for
74 * kvm. add a wait after the first sipi.
77 static int first_sipi = 1;
79 if (first_sipi) {
80 wait_hack = 1;
81 first_sipi = 0;
86 void kvm_apic_init(CPUState *env)
88 vcpu_info[env->cpu_index].init = 1;
89 kvm_update_interrupt_request(env);
92 static void set_msr_entry(struct kvm_msr_entry *entry, uint32_t index,
93 uint64_t data)
95 entry->index = index;
96 entry->data = data;
99 /* returns 0 on success, non-0 on failure */
100 static int get_msr_entry(struct kvm_msr_entry *entry, CPUState *env)
102 switch (entry->index) {
103 case MSR_IA32_SYSENTER_CS:
104 env->sysenter_cs = entry->data;
105 break;
106 case MSR_IA32_SYSENTER_ESP:
107 env->sysenter_esp = entry->data;
108 break;
109 case MSR_IA32_SYSENTER_EIP:
110 env->sysenter_eip = entry->data;
111 break;
112 case MSR_STAR:
113 env->star = entry->data;
114 break;
115 #ifdef TARGET_X86_64
116 case MSR_CSTAR:
117 env->cstar = entry->data;
118 break;
119 case MSR_KERNELGSBASE:
120 env->kernelgsbase = entry->data;
121 break;
122 case MSR_FMASK:
123 env->fmask = entry->data;
124 break;
125 case MSR_LSTAR:
126 env->lstar = entry->data;
127 break;
128 #endif
129 case MSR_IA32_TSC:
130 env->tsc = entry->data;
131 break;
132 default:
133 printf("Warning unknown msr index 0x%x\n", entry->index);
134 return 1;
136 return 0;
139 #ifdef TARGET_X86_64
140 #define MSR_COUNT 9
141 #else
142 #define MSR_COUNT 5
143 #endif
145 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
147 lhs->selector = rhs->selector;
148 lhs->base = rhs->base;
149 lhs->limit = rhs->limit;
150 lhs->type = 3;
151 lhs->present = 1;
152 lhs->dpl = 3;
153 lhs->db = 0;
154 lhs->s = 1;
155 lhs->l = 0;
156 lhs->g = 0;
157 lhs->avl = 0;
158 lhs->unusable = 0;
161 static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
163 unsigned flags = rhs->flags;
164 lhs->selector = rhs->selector;
165 lhs->base = rhs->base;
166 lhs->limit = rhs->limit;
167 lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
168 lhs->present = (flags & DESC_P_MASK) != 0;
169 lhs->dpl = rhs->selector & 3;
170 lhs->db = (flags >> DESC_B_SHIFT) & 1;
171 lhs->s = (flags & DESC_S_MASK) != 0;
172 lhs->l = (flags >> DESC_L_SHIFT) & 1;
173 lhs->g = (flags & DESC_G_MASK) != 0;
174 lhs->avl = (flags & DESC_AVL_MASK) != 0;
175 lhs->unusable = 0;
178 static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
180 lhs->selector = rhs->selector;
181 lhs->base = rhs->base;
182 lhs->limit = rhs->limit;
183 lhs->flags =
184 (rhs->type << DESC_TYPE_SHIFT)
185 | (rhs->present * DESC_P_MASK)
186 | (rhs->dpl << DESC_DPL_SHIFT)
187 | (rhs->db << DESC_B_SHIFT)
188 | (rhs->s * DESC_S_MASK)
189 | (rhs->l << DESC_L_SHIFT)
190 | (rhs->g * DESC_G_MASK)
191 | (rhs->avl * DESC_AVL_MASK);
194 /* the reset values of qemu are not compatible with SVM
195  * this function is used to fix the segment descriptor values */
196 static void fix_realmode_dataseg(struct kvm_segment *seg)
198 seg->type = 0x02;
199 seg->present = 1;
200 seg->s = 1;
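/* copy the qemu cpu state (general registers, fpu, segments, control
 * registers and msrs) into the in-kernel vcpu */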
203 static void load_regs(CPUState *env)
205 struct kvm_regs regs;
206 struct kvm_fpu fpu;
207 struct kvm_sregs sregs;
208 struct kvm_msr_entry msrs[MSR_COUNT];
209 int rc, n, i;
211 regs.rax = env->regs[R_EAX];
212 regs.rbx = env->regs[R_EBX];
213 regs.rcx = env->regs[R_ECX];
214 regs.rdx = env->regs[R_EDX];
215 regs.rsi = env->regs[R_ESI];
216 regs.rdi = env->regs[R_EDI];
217 regs.rsp = env->regs[R_ESP];
218 regs.rbp = env->regs[R_EBP];
219 #ifdef TARGET_X86_64
220 regs.r8 = env->regs[8];
221 regs.r9 = env->regs[9];
222 regs.r10 = env->regs[10];
223 regs.r11 = env->regs[11];
224 regs.r12 = env->regs[12];
225 regs.r13 = env->regs[13];
226 regs.r14 = env->regs[14];
227 regs.r15 = env->regs[15];
228 #endif
230 regs.rflags = env->eflags;
231 regs.rip = env->eip;
233 kvm_set_regs(kvm_context, env->cpu_index, &regs);
235 memset(&fpu, 0, sizeof fpu);
236 fpu.fsw = env->fpus & ~(7 << 11);
237 fpu.fsw |= (env->fpstt & 7) << 11;
238 fpu.fcw = env->fpuc;
239 for (i = 0; i < 8; ++i)
240 fpu.ftwx |= (!env->fptags[i]) << i;
241 memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
242 memcpy(fpu.xmm, env->xmm_regs, sizeof env->xmm_regs);
243 fpu.mxcsr = env->mxcsr;
244 kvm_set_fpu(kvm_context, env->cpu_index, &fpu);
246 memcpy(sregs.interrupt_bitmap, env->kvm_interrupt_bitmap, sizeof(sregs.interrupt_bitmap));
248 if ((env->eflags & VM_MASK)) {
249 set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
250 set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
251 set_v8086_seg(&sregs.es, &env->segs[R_ES]);
252 set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
253 set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
254 set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
255 } else {
256 set_seg(&sregs.cs, &env->segs[R_CS]);
257 set_seg(&sregs.ds, &env->segs[R_DS]);
258 set_seg(&sregs.es, &env->segs[R_ES]);
259 set_seg(&sregs.fs, &env->segs[R_FS]);
260 set_seg(&sregs.gs, &env->segs[R_GS]);
261 set_seg(&sregs.ss, &env->segs[R_SS]);
263 if (env->cr[0] & CR0_PE_MASK) {
264 /* force ss cpl to cs cpl */
265 sregs.ss.selector = (sregs.ss.selector & ~3) |
266 (sregs.cs.selector & 3);
267 sregs.ss.dpl = sregs.ss.selector & 3;
270 if (!(env->cr[0] & CR0_PG_MASK)) {
271 fix_realmode_dataseg(&sregs.cs);
272 fix_realmode_dataseg(&sregs.ds);
273 fix_realmode_dataseg(&sregs.es);
274 fix_realmode_dataseg(&sregs.fs);
275 fix_realmode_dataseg(&sregs.gs);
276 fix_realmode_dataseg(&sregs.ss);
280 set_seg(&sregs.tr, &env->tr);
281 set_seg(&sregs.ldt, &env->ldt);
283 sregs.idt.limit = env->idt.limit;
284 sregs.idt.base = env->idt.base;
285 sregs.gdt.limit = env->gdt.limit;
286 sregs.gdt.base = env->gdt.base;
288 sregs.cr0 = env->cr[0];
289 sregs.cr2 = env->cr[2];
290 sregs.cr3 = env->cr[3];
291 sregs.cr4 = env->cr[4];
293 sregs.apic_base = cpu_get_apic_base(env);
294 sregs.efer = env->efer;
295 sregs.cr8 = cpu_get_apic_tpr(env);
297 kvm_set_sregs(kvm_context, env->cpu_index, &sregs);
299 /* msrs */
300 n = 0;
301 set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
302 set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
303 set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
304 if (kvm_has_msr_star)
305 set_msr_entry(&msrs[n++], MSR_STAR, env->star);
306 set_msr_entry(&msrs[n++], MSR_IA32_TSC, env->tsc);
307 #ifdef TARGET_X86_64
308 if (lm_capable_kernel) {
309 set_msr_entry(&msrs[n++], MSR_CSTAR, env->cstar);
310 set_msr_entry(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
311 set_msr_entry(&msrs[n++], MSR_FMASK, env->fmask);
312 set_msr_entry(&msrs[n++], MSR_LSTAR , env->lstar);
314 #endif
316 rc = kvm_set_msrs(kvm_context, env->cpu_index, msrs, n);
317 if (rc == -1)
318 perror("kvm_set_msrs FAILED");
322 static void save_regs(CPUState *env)
324 struct kvm_regs regs;
325 struct kvm_fpu fpu;
326 struct kvm_sregs sregs;
327 struct kvm_msr_entry msrs[MSR_COUNT];
328 uint32_t hflags;
329 uint32_t i, n, rc;
331 kvm_get_regs(kvm_context, env->cpu_index, &regs);
333 env->regs[R_EAX] = regs.rax;
334 env->regs[R_EBX] = regs.rbx;
335 env->regs[R_ECX] = regs.rcx;
336 env->regs[R_EDX] = regs.rdx;
337 env->regs[R_ESI] = regs.rsi;
338 env->regs[R_EDI] = regs.rdi;
339 env->regs[R_ESP] = regs.rsp;
340 env->regs[R_EBP] = regs.rbp;
341 #ifdef TARGET_X86_64
342 env->regs[8] = regs.r8;
343 env->regs[9] = regs.r9;
344 env->regs[10] = regs.r10;
345 env->regs[11] = regs.r11;
346 env->regs[12] = regs.r12;
347 env->regs[13] = regs.r13;
348 env->regs[14] = regs.r14;
349 env->regs[15] = regs.r15;
350 #endif
352 env->eflags = regs.rflags;
353 env->eip = regs.rip;
355 kvm_get_fpu(kvm_context, env->cpu_index, &fpu);
356 env->fpstt = (fpu.fsw >> 11) & 7;
357 env->fpus = fpu.fsw;
358 env->fpuc = fpu.fcw;
359 for (i = 0; i < 8; ++i)
360 env->fptags[i] = !((fpu.ftwx >> i) & 1);
361 memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
362 memcpy(env->xmm_regs, fpu.xmm, sizeof env->xmm_regs);
363 env->mxcsr = fpu.mxcsr;
365 kvm_get_sregs(kvm_context, env->cpu_index, &sregs);
367 memcpy(env->kvm_interrupt_bitmap, sregs.interrupt_bitmap, sizeof(env->kvm_interrupt_bitmap));
369 get_seg(&env->segs[R_CS], &sregs.cs);
370 get_seg(&env->segs[R_DS], &sregs.ds);
371 get_seg(&env->segs[R_ES], &sregs.es);
372 get_seg(&env->segs[R_FS], &sregs.fs);
373 get_seg(&env->segs[R_GS], &sregs.gs);
374 get_seg(&env->segs[R_SS], &sregs.ss);
376 get_seg(&env->tr, &sregs.tr);
377 get_seg(&env->ldt, &sregs.ldt);
379 env->idt.limit = sregs.idt.limit;
380 env->idt.base = sregs.idt.base;
381 env->gdt.limit = sregs.gdt.limit;
382 env->gdt.base = sregs.gdt.base;
384 env->cr[0] = sregs.cr0;
385 env->cr[2] = sregs.cr2;
386 env->cr[3] = sregs.cr3;
387 env->cr[4] = sregs.cr4;
389 cpu_set_apic_base(env, sregs.apic_base);
391 env->efer = sregs.efer;
392 //cpu_set_apic_tpr(env, sregs.cr8);
394 #define HFLAG_COPY_MASK ~( \
395 HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
396 HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
397 HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
398 HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)
402 hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
403 hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
404 hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
405 (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
406 hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK));
407 hflags |= (env->cr[4] & CR4_OSFXSR_MASK) <<
408 (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT);
410 if (env->efer & MSR_EFER_LMA) {
411 hflags |= HF_LMA_MASK;
414 if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) {
415 hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK;
416 } else {
417 hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >>
418 (DESC_B_SHIFT - HF_CS32_SHIFT);
419 hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >>
420 (DESC_B_SHIFT - HF_SS32_SHIFT);
421 if (!(env->cr[0] & CR0_PE_MASK) ||
422 (env->eflags & VM_MASK) ||
423 !(hflags & HF_CS32_MASK)) {
424 hflags |= HF_ADDSEG_MASK;
425 } else {
426 hflags |= ((env->segs[R_DS].base |
427 env->segs[R_ES].base |
428 env->segs[R_SS].base) != 0) <<
429 HF_ADDSEG_SHIFT;
432 env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags;
433 env->cc_src = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
434 env->df = 1 - (2 * ((env->eflags >> 10) & 1));
435 env->cc_op = CC_OP_EFLAGS;
436 env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
438 /* msrs */
439 n = 0;
440 msrs[n++].index = MSR_IA32_SYSENTER_CS;
441 msrs[n++].index = MSR_IA32_SYSENTER_ESP;
442 msrs[n++].index = MSR_IA32_SYSENTER_EIP;
443 if (kvm_has_msr_star)
444 msrs[n++].index = MSR_STAR;
445 msrs[n++].index = MSR_IA32_TSC;
446 #ifdef TARGET_X86_64
447 if (lm_capable_kernel) {
448 msrs[n++].index = MSR_CSTAR;
449 msrs[n++].index = MSR_KERNELGSBASE;
450 msrs[n++].index = MSR_FMASK;
451 msrs[n++].index = MSR_LSTAR;
453 #endif
454 rc = kvm_get_msrs(kvm_context, env->cpu_index, msrs, n);
455 if (rc == -1) {
456 perror("kvm_get_msrs FAILED");
458 else {
459 n = rc; /* actual number of MSRs */
460 for (i=0 ; i<n; i++) {
461 if (get_msr_entry(&msrs[i], env))
462 return;
467 #include <signal.h>
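/* libkvm callback: if the guest is ready, inject a pending PIC interrupt and
 * report whether another one is still pending */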
470 static int try_push_interrupts(void *opaque)
472 CPUState *env = cpu_single_env;
473 int r, irq;
475 if (env->ready_for_interrupt_injection &&
476 (env->interrupt_request & CPU_INTERRUPT_HARD) &&
477 (env->eflags & IF_MASK)) {
478 env->interrupt_request &= ~CPU_INTERRUPT_HARD;
479 irq = cpu_get_pic_interrupt(env);
480 if (irq >= 0) {
481 r = kvm_inject_irq(kvm_context, env->cpu_index, irq);
482 if (r < 0)
483 printf("cpu %d fail inject %x\n", env->cpu_index, irq);
487 return (env->interrupt_request & CPU_INTERRUPT_HARD) != 0;
490 static void post_kvm_run(void *opaque, int vcpu)
492 CPUState *env = vcpu_env;
494 pthread_mutex_lock(&qemu_mutex);
495 cpu_single_env = env;
496 env->eflags = kvm_get_interrupt_flag(kvm_context, vcpu)
497 ? env->eflags | IF_MASK : env->eflags & ~IF_MASK;
498 env->ready_for_interrupt_injection
499 = kvm_is_ready_for_interrupt_injection(kvm_context, vcpu);
501 cpu_set_apic_tpr(env, kvm_get_cr8(kvm_context, vcpu));
502 cpu_set_apic_base(env, kvm_get_apic_base(kvm_context, vcpu));
505 static int pre_kvm_run(void *opaque, int vcpu)
507 CPUState *env = cpu_single_env;
509 if (env->cpu_index == 0 && wait_hack) {
510 int i;
512 wait_hack = 0;
514 pthread_mutex_unlock(&qemu_mutex);
515 for (i = 0; i < 10; ++i)
516 usleep(1000);
517 pthread_mutex_lock(&qemu_mutex);
520 kvm_set_cr8(kvm_context, vcpu, cpu_get_apic_tpr(env));
521 if (env->interrupt_request & CPU_INTERRUPT_EXIT)
522 return 1;
523 pthread_mutex_unlock(&qemu_mutex);
524 return 0;
527 void kvm_load_registers(CPUState *env)
529 if (kvm_allowed)
530 load_regs(env);
533 void kvm_save_registers(CPUState *env)
535 if (kvm_allowed)
536 save_regs(env);
539 int kvm_cpu_exec(CPUState *env)
541 int r;
543 r = kvm_run(kvm_context, env->cpu_index);
544 if (r < 0) {
545 printf("kvm_run returned %d\n", r);
546 exit(1);
549 return 0;
552 extern int vm_running;
554 static int has_work(CPUState *env)
556 if (!vm_running)
557 return 0;
558 if (!(env->hflags & HF_HALTED_MASK))
559 return 1;
560 if (env->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_EXIT))
561 return 1;
562 return 0;
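/* wait up to timeout ms for one of the I/O signals (sigtimedwait on
 * io_sigset) and run its handler under qemu_mutex; returns 1 if a signal was
 * handled */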
565 static int kvm_eat_signal(CPUState *env, int timeout)
567 struct timespec ts;
568 int r, e, ret = 0;
569 siginfo_t siginfo;
570 struct sigaction sa;
572 ts.tv_sec = timeout / 1000;
573 ts.tv_nsec = (timeout % 1000) * 1000000;
574 r = sigtimedwait(&io_sigset, &siginfo, &ts);
575 if (r == -1 && (errno == EAGAIN || errno == EINTR) && !timeout)
576 return 0;
577 e = errno;
578 pthread_mutex_lock(&qemu_mutex);
579 cpu_single_env = vcpu_env;
580 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
581 printf("sigtimedwait: %s\n", strerror(e));
582 exit(1);
584 if (r != -1) {
585 sigaction(siginfo.si_signo, NULL, &sa);
586 sa.sa_handler(siginfo.si_signo);
587 ret = 1;
589 pthread_mutex_unlock(&qemu_mutex);
591 return ret;
595 static void kvm_eat_signals(CPUState *env, int timeout)
597 int r = 0;
599 while (kvm_eat_signal(env, 0))
600 r = 1;
601 if (!r && timeout) {
602 r = kvm_eat_signal(env, timeout);
603 if (r)
604 while (kvm_eat_signal(env, 0))
608 * we call select() even if no signal was received, to account for
609 * I/O events for which there is no signal handler installed.
611 pthread_mutex_lock(&qemu_mutex);
612 cpu_single_env = vcpu_env;
613 main_loop_wait(0);
614 pthread_mutex_unlock(&qemu_mutex);
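/* idle path for a vcpu thread: cpu 0 services the I/O signals, the other
 * vcpus wait for SIG_IPI and honour stop requests from the pause protocol */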
617 static void kvm_main_loop_wait(CPUState *env, int timeout)
619 pthread_mutex_unlock(&qemu_mutex);
620 if (env->cpu_index == 0)
621 kvm_eat_signals(env, timeout);
622 else {
623 if (timeout || vcpu_info[env->cpu_index].stopped) {
624 sigset_t set;
625 int n;
627 paused:
628 sigemptyset(&set);
629 sigaddset(&set, SIG_IPI);
630 sigwait(&set, &n);
631 } else {
632 struct timespec ts;
633 siginfo_t siginfo;
634 sigset_t set;
636 ts.tv_sec = 0;
637 ts.tv_nsec = 0;
638 sigemptyset(&set);
639 sigaddset(&set, SIG_IPI);
640 sigtimedwait(&io_sigset, &siginfo, &ts);
642 if (vcpu_info[env->cpu_index].stop) {
643 vcpu_info[env->cpu_index].stop = 0;
644 vcpu_info[env->cpu_index].stopped = 1;
645 pthread_kill(vcpu_info[0].thread, SIG_IPI);
646 goto paused;
649 pthread_mutex_lock(&qemu_mutex);
650 cpu_single_env = env;
651 vcpu_info[env->cpu_index].signalled = 0;
654 static int all_threads_paused(void)
656 int i;
658 for (i = 1; i < smp_cpus; ++i)
659 if (!vcpu_info[i].stopped)
660 return 0;
661 return 1;
664 static void pause_other_threads(void)
666 int i;
668 for (i = 1; i < smp_cpus; ++i) {
669 vcpu_info[i].stop = 1;
670 pthread_kill(vcpu_info[i].thread, SIG_IPI);
672 while (!all_threads_paused())
673 kvm_eat_signals(vcpu_env, 0);
676 static void resume_other_threads(void)
678 int i;
680 for (i = 1; i < smp_cpus; ++i) {
681 vcpu_info[i].stop = 0;
682 vcpu_info[i].stopped = 0;
683 pthread_kill(vcpu_info[i].thread, SIG_IPI);
687 static void kvm_vm_state_change_handler(void *context, int running)
689 if (running)
690 resume_other_threads();
691 else
692 pause_other_threads();
695 static void update_regs_for_sipi(CPUState *env)
697 SegmentCache cs = env->segs[R_CS];
699 save_regs(env);
700 env->segs[R_CS] = cs;
701 env->eip = 0;
702 load_regs(env);
703 vcpu_info[env->cpu_index].sipi_needed = 0;
704 vcpu_info[env->cpu_index].init = 0;
707 static void update_regs_for_init(CPUState *env)
709 cpu_reset(env);
710 load_regs(env);
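/* signal mask used while the vcpu executes in the kernel: SIG_IPI is always
 * allowed, and cpu 0 additionally allows the I/O signals */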
713 static void setup_kernel_sigmask(CPUState *env)
715 sigset_t set;
717 sigprocmask(SIG_BLOCK, NULL, &set);
718 sigdelset(&set, SIG_IPI);
719 if (env->cpu_index == 0)
720 sigandset(&set, &set, &io_negsigset);
722 kvm_set_signal_mask(kvm_context, env->cpu_index, &set);
725 static int kvm_main_loop_cpu(CPUState *env)
727 struct vcpu_info *info = &vcpu_info[env->cpu_index];
729 setup_kernel_sigmask(env);
730 pthread_mutex_lock(&qemu_mutex);
731 cpu_single_env = env;
732 while (1) {
733 while (!has_work(env))
734 kvm_main_loop_wait(env, 10);
735 if (env->interrupt_request & CPU_INTERRUPT_HARD)
736 env->hflags &= ~HF_HALTED_MASK;
737 if (info->sipi_needed)
738 update_regs_for_sipi(env);
739 if (info->init)
740 update_regs_for_init(env);
741 if (!(env->hflags & HF_HALTED_MASK) && !info->init)
742 kvm_cpu_exec(env);
743 env->interrupt_request &= ~CPU_INTERRUPT_EXIT;
744 kvm_main_loop_wait(env, 0);
745 if (qemu_shutdown_requested())
746 break;
747 else if (qemu_powerdown_requested())
748 qemu_system_powerdown();
749 else if (qemu_reset_requested()) {
750 env->interrupt_request = 0;
751 qemu_system_reset();
752 load_regs(env);
755 pthread_mutex_unlock(&qemu_mutex);
756 return 0;
759 static void *ap_main_loop(void *_env)
761 CPUState *env = _env;
762 sigset_t signals;
764 vcpu_env = env;
765 sigfillset(&signals);
766 //sigdelset(&signals, SIG_IPI);
767 sigprocmask(SIG_BLOCK, &signals, NULL);
768 kvm_create_vcpu(kvm_context, env->cpu_index);
769 kvm_qemu_init_env(env);
770 kvm_main_loop_cpu(env);
771 return NULL;
774 static void kvm_add_signal(int signum)
776 sigaddset(&io_sigset, signum);
777 sigdelset(&io_negsigset, signum);
778 sigprocmask(SIG_BLOCK, &io_sigset, NULL);
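/* block the I/O signals in every thread and start one pthread per secondary
 * vcpu; cpu 0 keeps running in the main thread (see kvm_main_loop) */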
781 int kvm_init_ap(void)
783 CPUState *env = first_cpu->next_cpu;
784 int i;
786 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
787 sigemptyset(&io_sigset);
788 sigfillset(&io_negsigset);
789 kvm_add_signal(SIGIO);
790 kvm_add_signal(SIGALRM);
791 kvm_add_signal(SIGUSR2);
792 kvm_add_signal(SIG_IPI);
794 vcpu_env = first_cpu;
795 signal(SIG_IPI, sig_ipi_handler);
796 for (i = 1; i < smp_cpus; ++i) {
797 pthread_create(&vcpu_info[i].thread, NULL, ap_main_loop, env);
798 env = env->next_cpu;
800 return 0;
803 int kvm_main_loop(void)
805 vcpu_info[0].thread = pthread_self();
806 return kvm_main_loop_cpu(first_cpu);
809 static int kvm_debug(void *opaque, int vcpu)
811 CPUState *env = cpu_single_env;
813 env->exception_index = EXCP_DEBUG;
814 return 1;
817 static int kvm_inb(void *opaque, uint16_t addr, uint8_t *data)
819 *data = cpu_inb(0, addr);
820 return 0;
823 static int kvm_inw(void *opaque, uint16_t addr, uint16_t *data)
825 *data = cpu_inw(0, addr);
826 return 0;
829 static int kvm_inl(void *opaque, uint16_t addr, uint32_t *data)
831 *data = cpu_inl(0, addr);
832 return 0;
835 #define PM_IO_BASE 0xb000
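/* writes to port 0xb2 are special-cased to emulate acpi enable/disable by
 * toggling bit 0 of the PM control register at PM_IO_BASE + 4; all other
 * ports go straight to cpu_outb() */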
837 static int kvm_outb(void *opaque, uint16_t addr, uint8_t data)
839 if (addr == 0xb2) {
840 switch (data) {
841 case 0: {
842 cpu_outb(0, 0xb3, 0);
843 break;
845 case 0xf0: {
846 unsigned x;
848 /* disable acpi */
849 x = cpu_inw(0, PM_IO_BASE + 4);
850 x &= ~1;
851 cpu_outw(0, PM_IO_BASE + 4, x);
852 break;
854 case 0xf1: {
855 unsigned x;
857 /* enable acpi */
858 x = cpu_inw(0, PM_IO_BASE + 4);
859 x |= 1;
860 cpu_outw(0, PM_IO_BASE + 4, x);
861 break;
863 default:
864 break;
866 return 0;
868 cpu_outb(0, addr, data);
869 return 0;
872 static int kvm_outw(void *opaque, uint16_t addr, uint16_t data)
874 cpu_outw(0, addr, data);
875 return 0;
878 static int kvm_outl(void *opaque, uint16_t addr, uint32_t data)
880 cpu_outl(0, addr, data);
881 return 0;
884 static int kvm_readb(void *opaque, uint64_t addr, uint8_t *data)
886 *data = ldub_phys(addr);
887 return 0;
890 static int kvm_readw(void *opaque, uint64_t addr, uint16_t *data)
892 *data = lduw_phys(addr);
893 return 0;
896 static int kvm_readl(void *opaque, uint64_t addr, uint32_t *data)
898 *data = ldl_phys(addr);
899 return 0;
902 static int kvm_readq(void *opaque, uint64_t addr, uint64_t *data)
904 *data = ldq_phys(addr);
905 return 0;
908 static int kvm_writeb(void *opaque, uint64_t addr, uint8_t data)
910 stb_phys(addr, data);
911 return 0;
914 static int kvm_writew(void *opaque, uint64_t addr, uint16_t data)
916 stw_phys(addr, data);
917 return 0;
920 static int kvm_writel(void *opaque, uint64_t addr, uint32_t data)
922 stl_phys(addr, data);
923 return 0;
926 static int kvm_writeq(void *opaque, uint64_t addr, uint64_t data)
928 stq_phys(addr, data);
929 return 0;
932 static int kvm_io_window(void *opaque)
934 return 1;
938 static int kvm_halt(void *opaque, int vcpu)
940 CPUState *env = cpu_single_env;
942 if (!((env->interrupt_request & CPU_INTERRUPT_HARD) &&
943 (env->eflags & IF_MASK))) {
944 env->hflags |= HF_HALTED_MASK;
945 env->exception_index = EXCP_HLT;
948 return 1;
951 static int kvm_shutdown(void *opaque, int vcpu)
953 qemu_system_reset_request();
954 return 1;
957 static struct kvm_callbacks qemu_kvm_ops = {
958 .debug = kvm_debug,
959 .inb = kvm_inb,
960 .inw = kvm_inw,
961 .inl = kvm_inl,
962 .outb = kvm_outb,
963 .outw = kvm_outw,
964 .outl = kvm_outl,
965 .readb = kvm_readb,
966 .readw = kvm_readw,
967 .readl = kvm_readl,
968 .readq = kvm_readq,
969 .writeb = kvm_writeb,
970 .writew = kvm_writew,
971 .writel = kvm_writel,
972 .writeq = kvm_writeq,
973 .halt = kvm_halt,
974 .shutdown = kvm_shutdown,
975 .io_window = kvm_io_window,
976 .try_push_interrupts = try_push_interrupts,
977 .post_kvm_run = post_kvm_run,
978 .pre_kvm_run = pre_kvm_run,
981 int kvm_qemu_init()
983 /* Try to initialize kvm */
984 kvm_context = kvm_init(&qemu_kvm_ops, cpu_single_env);
985 if (!kvm_context) {
986 return -1;
989 return 0;
992 int kvm_qemu_create_context(void)
994 int i;
996 if (kvm_create(kvm_context, phys_ram_size, (void**)&phys_ram_base) < 0) {
997 kvm_qemu_destroy();
998 return -1;
1000 kvm_msr_list = kvm_get_msr_list(kvm_context);
1001 if (!kvm_msr_list) {
1002 kvm_qemu_destroy();
1003 return -1;
1005 for (i = 0; i < kvm_msr_list->nmsrs; ++i)
1006 if (kvm_msr_list->indices[i] == MSR_STAR)
1007 kvm_has_msr_star = 1;
1008 return 0;
1011 void kvm_qemu_destroy(void)
1013 kvm_finalize(kvm_context);
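/* execute cpuid on the host cpu, preserving the registers it clobbers */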
1016 static void host_cpuid(uint32_t function, uint32_t *eax, uint32_t *ebx,
1017 uint32_t *ecx, uint32_t *edx)
1019 uint32_t vec[4];
1021 vec[0] = function;
1022 asm volatile (
1023 #ifdef __x86_64__
1024 "sub $128, %%rsp \n\t" /* skip red zone */
1025 "push %0; push %%rsi \n\t"
1026 "push %%rax; push %%rbx; push %%rcx; push %%rdx \n\t"
1027 "mov 8*5(%%rsp), %%rsi \n\t"
1028 "mov (%%rsi), %%eax \n\t"
1029 "cpuid \n\t"
1030 "mov %%eax, (%%rsi) \n\t"
1031 "mov %%ebx, 4(%%rsi) \n\t"
1032 "mov %%ecx, 8(%%rsi) \n\t"
1033 "mov %%edx, 12(%%rsi) \n\t"
1034 "pop %%rdx; pop %%rcx; pop %%rbx; pop %%rax \n\t"
1035 "pop %%rsi; pop %0 \n\t"
1036 "add $128, %%rsp"
1037 #else
1038 "push %0; push %%esi \n\t"
1039 "push %%eax; push %%ebx; push %%ecx; push %%edx \n\t"
1040 "mov 4*5(%%esp), %%esi \n\t"
1041 "mov (%%esi), %%eax \n\t"
1042 "cpuid \n\t"
1043 "mov %%eax, (%%esi) \n\t"
1044 "mov %%ebx, 4(%%esi) \n\t"
1045 "mov %%ecx, 8(%%esi) \n\t"
1046 "mov %%edx, 12(%%esi) \n\t"
1047 "pop %%edx; pop %%ecx; pop %%ebx; pop %%eax \n\t"
1048 "pop %%esi; pop %0 \n\t"
1049 #endif
1050 : : "rm"(vec) : "memory");
1051 if (eax)
1052 *eax = vec[0];
1053 if (ebx)
1054 *ebx = vec[1];
1055 if (ecx)
1056 *ecx = vec[2];
1057 if (edx)
1058 *edx = vec[3];
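/* fill one kvm_cpuid_entry from qemu's cpuid emulation, masking out long
 * mode, syscall and nx when the host cpu or kernel cannot provide them */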
1061 static void do_cpuid_ent(struct kvm_cpuid_entry *e, uint32_t function,
1062 CPUState *env)
1064 env->regs[R_EAX] = function;
1065 qemu_kvm_cpuid_on_env(env);
1066 e->function = function;
1067 e->eax = env->regs[R_EAX];
1068 e->ebx = env->regs[R_EBX];
1069 e->ecx = env->regs[R_ECX];
1070 e->edx = env->regs[R_EDX];
1071 if (function == 0x80000001) {
1072 uint32_t h_eax, h_edx;
1073 struct utsname utsname;
1075 host_cpuid(function, &h_eax, NULL, NULL, &h_edx);
1076 uname(&utsname);
1077 lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;
1079 // long mode
1080 if ((h_edx & 0x20000000) == 0 || !lm_capable_kernel)
1081 e->edx &= ~0x20000000u;
1082 // syscall
1083 if ((h_edx & 0x00000800) == 0)
1084 e->edx &= ~0x00000800u;
1085 // nx
1086 if ((h_edx & 0x00100000) == 0)
1087 e->edx &= ~0x00100000u;
1089 // sysenter isn't supported in compatibility mode on AMD, and syscall
1090 // isn't supported in compatibility mode on Intel, so advertise the
1091 // actual cpu, and say goodbye to migration between different vendors
1092 // if you use compatibility mode.
1093 if (function == 0) {
1094 uint32_t bcd[3];
1096 host_cpuid(0, NULL, &bcd[0], &bcd[1], &bcd[2]);
1097 e->ebx = bcd[0];
1098 e->ecx = bcd[1];
1099 e->edx = bcd[2];
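/* build the complete cpuid table (standard and extended leaves) on a scratch
 * copy of the cpu state and hand it to the kernel */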
1103 int kvm_qemu_init_env(CPUState *cenv)
1105 struct kvm_cpuid_entry cpuid_ent[100];
1106 int cpuid_nent = 0;
1107 CPUState copy;
1108 uint32_t i, limit;
1110 copy = *cenv;
1112 copy.regs[R_EAX] = 0;
1113 qemu_kvm_cpuid_on_env(&copy);
1114 limit = copy.regs[R_EAX];
1116 for (i = 0; i <= limit; ++i)
1117 do_cpuid_ent(&cpuid_ent[cpuid_nent++], i, &copy);
1119 copy.regs[R_EAX] = 0x80000000;
1120 qemu_kvm_cpuid_on_env(&copy);
1121 limit = copy.regs[R_EAX];
1123 for (i = 0x80000000; i <= limit; ++i)
1124 do_cpuid_ent(&cpuid_ent[cpuid_nent++], i, &copy);
1126 kvm_setup_cpuid(kvm_context, cenv->cpu_index, cpuid_nent, cpuid_ent);
1128 return 0;
1131 int kvm_update_debugger(CPUState *env)
1133 struct kvm_debug_guest dbg;
1134 int i;
1136 dbg.enabled = 0;
1137 if (env->nb_breakpoints || env->singlestep_enabled) {
1138 dbg.enabled = 1;
1139 for (i = 0; i < 4 && i < env->nb_breakpoints; ++i) {
1140 dbg.breakpoints[i].enabled = 1;
1141 dbg.breakpoints[i].address = env->breakpoints[i];
1143 dbg.singlestep = env->singlestep_enabled;
1145 return kvm_guest_debug(kvm_context, env->cpu_index, &dbg);
1150 * dirty pages logging
1152 /* FIXME: use unsigned long pointer instead of unsigned char */
1153 unsigned char *kvm_dirty_bitmap = NULL;
1154 int kvm_physical_memory_set_dirty_tracking(int enable)
1156 int r = 0;
1158 if (!kvm_allowed)
1159 return 0;
1161 if (enable) {
1162 if (!kvm_dirty_bitmap) {
1163 unsigned bitmap_size = BITMAP_SIZE(phys_ram_size);
1164 kvm_dirty_bitmap = qemu_malloc(bitmap_size);
1165 if (kvm_dirty_bitmap == NULL) {
1166 perror("Failed to allocate dirty pages bitmap");
1167 r=-1;
1169 else {
1170 r = kvm_dirty_pages_log_enable_all(kvm_context);
1174 else {
1175 if (kvm_dirty_bitmap) {
1176 r = kvm_dirty_pages_log_reset(kvm_context);
1177 qemu_free(kvm_dirty_bitmap);
1178 kvm_dirty_bitmap = NULL;
1181 return r;
1184 /* get kvm's dirty pages bitmap and update qemu's */
1185 int kvm_get_dirty_pages_log_slot(int slot,
1186 unsigned char *bitmap,
1187 unsigned int offset,
1188 unsigned int len)
1190 int r;
1191 unsigned int i, j, n=0;
1192 unsigned char c;
1193 unsigned page_number, addr, addr1;
1195 memset(bitmap, 0, len);
1196 r = kvm_get_dirty_pages(kvm_context, slot, bitmap);
1197 if (r)
1198 return r;
1201 * bitmap-traveling is faster than memory-traveling (for addr...)
1202 * especially when most of the memory is not dirty.
1204 for (i=0; i<len; i++) {
1205 c = bitmap[i];
1206 while (c>0) {
1207 j = ffsl(c) - 1;
1208 c &= ~(1u<<j);
1209 page_number = i * 8 + j;
1210 addr1 = page_number * TARGET_PAGE_SIZE;
1211 addr = offset + addr1;
1212 cpu_physical_memory_set_dirty(addr);
1213 n++;
1216 return 0;
1220 * get kvm's dirty pages bitmap and update qemu's
1221 * we only care about physical ram, which resides in slots 0 and 3
1223 int kvm_update_dirty_pages_log(void)
1225 int r = 0, len;
1227 len = BITMAP_SIZE(0xa0000);
1228 r = kvm_get_dirty_pages_log_slot(3, kvm_dirty_bitmap, 0 , len);
1229 len = BITMAP_SIZE(phys_ram_size - 0xc0000);
1230 r = r || kvm_get_dirty_pages_log_slot(0, kvm_dirty_bitmap, 0xc0000, len);
1231 return r;
1234 int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap)
1236 int r=0, len, offset;
1238 len = BITMAP_SIZE(phys_ram_size);
1239 memset(bitmap, 0, len);
1241 r = kvm_get_mem_map(kvm_context, 3, bitmap);
1242 if (r)
1243 goto out;
1245 offset = BITMAP_SIZE(0xc0000);
1246 r = kvm_get_mem_map(kvm_context, 0, bitmap + offset);
1248 out:
1249 return r;
1252 int kvm_set_irq(int irq, int level)
1254 return kvm_set_irq_level(kvm_context, irq, level);
1256 #endif