/*
 * qemu-kvm.c  (qemu-kvm/amd-iommu.git)
 */

#include "config.h"
#include "config-host.h"

#ifdef USE_KVM
#define KVM_ALLOWED_DEFAULT 1
#else
#define KVM_ALLOWED_DEFAULT 0
#endif

int kvm_allowed = KVM_ALLOWED_DEFAULT;
static int lm_capable_kernel;
int kvm_irqchip = 1;

#ifdef USE_KVM

#include <string.h>
#include "vl.h"

#include "qemu-kvm.h"
#include <libkvm.h>
#include <pthread.h>
#include <sys/utsname.h>

#define MSR_IA32_TSC 0x10

extern void perror(const char *s);

kvm_context_t kvm_context;
static struct kvm_msr_list *kvm_msr_list;
static int kvm_has_msr_star;

extern int smp_cpus;
extern unsigned int kvm_shadow_memory;

pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
static __thread CPUState *vcpu_env;

static sigset_t io_sigset, io_negsigset;

static int wait_hack;

#define SIG_IPI (SIGRTMIN+4)
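
/*
 * SIG_IPI is a host real-time signal used to kick a vcpu thread out of
 * KVM_RUN so that it notices pending work.  The vcpu_info array below
 * keeps, per vcpu, the thread handle and the SIPI/INIT/stop bookkeeping
 * that the signalling code and the main loops act on.
 */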
struct vcpu_info {
    int sipi_needed;
    int init;
    pthread_t thread;
    int signalled;
    int stop;
    int stopped;
} vcpu_info[4];

static void sig_ipi_handler(int n)
{
}

void kvm_update_interrupt_request(CPUState *env)
{
    if (env && env != vcpu_env) {
        if (vcpu_info[env->cpu_index].signalled)
            return;
        vcpu_info[env->cpu_index].signalled = 1;
        if (vcpu_info[env->cpu_index].thread)
            pthread_kill(vcpu_info[env->cpu_index].thread, SIG_IPI);
    }
}

void kvm_update_after_sipi(CPUState *env)
{
    vcpu_info[env->cpu_index].sipi_needed = 1;
    kvm_update_interrupt_request(env);

    /*
     * the qemu bios waits using a busy loop that's much too short for
     * kvm.  add a wait after the first sipi.
     */
    {
        static int first_sipi = 1;

        if (first_sipi) {
            wait_hack = 1;
            first_sipi = 0;
        }
    }
}

void kvm_apic_init(CPUState *env)
{
    if (env->cpu_index != 0)
        vcpu_info[env->cpu_index].init = 1;
    kvm_update_interrupt_request(env);
}

static void set_msr_entry(struct kvm_msr_entry *entry, uint32_t index,
                          uint64_t data)
{
    entry->index = index;
    entry->data = data;
}

/* returns 0 on success, non-0 on failure */
static int get_msr_entry(struct kvm_msr_entry *entry, CPUState *env)
{
    switch (entry->index) {
    case MSR_IA32_SYSENTER_CS:
        env->sysenter_cs = entry->data;
        break;
    case MSR_IA32_SYSENTER_ESP:
        env->sysenter_esp = entry->data;
        break;
    case MSR_IA32_SYSENTER_EIP:
        env->sysenter_eip = entry->data;
        break;
    case MSR_STAR:
        env->star = entry->data;
        break;
#ifdef TARGET_X86_64
    case MSR_CSTAR:
        env->cstar = entry->data;
        break;
    case MSR_KERNELGSBASE:
        env->kernelgsbase = entry->data;
        break;
    case MSR_FMASK:
        env->fmask = entry->data;
        break;
    case MSR_LSTAR:
        env->lstar = entry->data;
        break;
#endif
    case MSR_IA32_TSC:
        env->tsc = entry->data;
        break;
    default:
        printf("Warning: unknown MSR index 0x%x\n", entry->index);
        return 1;
    }
    return 0;
}

#ifdef TARGET_X86_64
#define MSR_COUNT 9
#else
#define MSR_COUNT 5
#endif
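
/*
 * MSR_COUNT sizes the kvm_msr_entry arrays used by load_regs() and
 * save_regs(): SYSENTER_CS/ESP/EIP, STAR and TSC (5 entries), plus CSTAR,
 * KERNELGSBASE, FMASK and LSTAR when built for x86_64 (those four are only
 * filled in when the host kernel is long-mode capable).
 */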

static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = 3;
    lhs->present = 1;
    lhs->dpl = 3;
    lhs->db = 0;
    lhs->s = 1;
    lhs->l = 0;
    lhs->g = 0;
    lhs->avl = 0;
    lhs->unusable = 0;
}

static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    unsigned flags = rhs->flags;
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
    lhs->present = (flags & DESC_P_MASK) != 0;
    lhs->dpl = rhs->selector & 3;
    lhs->db = (flags >> DESC_B_SHIFT) & 1;
    lhs->s = (flags & DESC_S_MASK) != 0;
    lhs->l = (flags >> DESC_L_SHIFT) & 1;
    lhs->g = (flags & DESC_G_MASK) != 0;
    lhs->avl = (flags & DESC_AVL_MASK) != 0;
    lhs->unusable = 0;
}

static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->flags =
        (rhs->type << DESC_TYPE_SHIFT)
        | (rhs->present * DESC_P_MASK)
        | (rhs->dpl << DESC_DPL_SHIFT)
        | (rhs->db << DESC_B_SHIFT)
        | (rhs->s * DESC_S_MASK)
        | (rhs->l << DESC_L_SHIFT)
        | (rhs->g * DESC_G_MASK)
        | (rhs->avl * DESC_AVL_MASK);
}

/* the reset values of qemu are not compatible with SVM;
 * this function is used to fix up the segment descriptor values */
static void fix_realmode_dataseg(struct kvm_segment *seg)
{
    seg->type = 0x02;
    seg->present = 1;
    seg->s = 1;
}
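
/*
 * load_regs(): copy qemu's CPUState into the kvm vcpu -- general purpose
 * registers, FPU/SSE state, segment and control registers, and the MSRs
 * listed above.
 */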
static void load_regs(CPUState *env)
{
    struct kvm_regs regs;
    struct kvm_fpu fpu;
    struct kvm_sregs sregs;
    struct kvm_msr_entry msrs[MSR_COUNT];
    int rc, n, i;

    regs.rax = env->regs[R_EAX];
    regs.rbx = env->regs[R_EBX];
    regs.rcx = env->regs[R_ECX];
    regs.rdx = env->regs[R_EDX];
    regs.rsi = env->regs[R_ESI];
    regs.rdi = env->regs[R_EDI];
    regs.rsp = env->regs[R_ESP];
    regs.rbp = env->regs[R_EBP];
#ifdef TARGET_X86_64
    regs.r8 = env->regs[8];
    regs.r9 = env->regs[9];
    regs.r10 = env->regs[10];
    regs.r11 = env->regs[11];
    regs.r12 = env->regs[12];
    regs.r13 = env->regs[13];
    regs.r14 = env->regs[14];
    regs.r15 = env->regs[15];
#endif

    regs.rflags = env->eflags;
    regs.rip = env->eip;

    kvm_set_regs(kvm_context, env->cpu_index, &regs);

    memset(&fpu, 0, sizeof fpu);
    fpu.fsw = env->fpus & ~(7 << 11);
    fpu.fsw |= (env->fpstt & 7) << 11;
    fpu.fcw = env->fpuc;
    for (i = 0; i < 8; ++i)
        fpu.ftwx |= (!env->fptags[i]) << i;
    memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
    memcpy(fpu.xmm, env->xmm_regs, sizeof env->xmm_regs);
    fpu.mxcsr = env->mxcsr;
    kvm_set_fpu(kvm_context, env->cpu_index, &fpu);

    memcpy(sregs.interrupt_bitmap, env->kvm_interrupt_bitmap, sizeof(sregs.interrupt_bitmap));

    if ((env->eflags & VM_MASK)) {
        set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
        set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
        set_v8086_seg(&sregs.es, &env->segs[R_ES]);
        set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
        set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
        set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
    } else {
        set_seg(&sregs.cs, &env->segs[R_CS]);
        set_seg(&sregs.ds, &env->segs[R_DS]);
        set_seg(&sregs.es, &env->segs[R_ES]);
        set_seg(&sregs.fs, &env->segs[R_FS]);
        set_seg(&sregs.gs, &env->segs[R_GS]);
        set_seg(&sregs.ss, &env->segs[R_SS]);

        if (env->cr[0] & CR0_PE_MASK) {
            /* force ss cpl to cs cpl */
            sregs.ss.selector = (sregs.ss.selector & ~3) |
                (sregs.cs.selector & 3);
            sregs.ss.dpl = sregs.ss.selector & 3;
        }

        if (!(env->cr[0] & CR0_PG_MASK)) {
            fix_realmode_dataseg(&sregs.cs);
            fix_realmode_dataseg(&sregs.ds);
            fix_realmode_dataseg(&sregs.es);
            fix_realmode_dataseg(&sregs.fs);
            fix_realmode_dataseg(&sregs.gs);
            fix_realmode_dataseg(&sregs.ss);
        }
    }

    set_seg(&sregs.tr, &env->tr);
    set_seg(&sregs.ldt, &env->ldt);

    sregs.idt.limit = env->idt.limit;
    sregs.idt.base = env->idt.base;
    sregs.gdt.limit = env->gdt.limit;
    sregs.gdt.base = env->gdt.base;

    sregs.cr0 = env->cr[0];
    sregs.cr2 = env->cr[2];
    sregs.cr3 = env->cr[3];
    sregs.cr4 = env->cr[4];

    sregs.apic_base = cpu_get_apic_base(env);
    sregs.efer = env->efer;
    sregs.cr8 = cpu_get_apic_tpr(env);

    kvm_set_sregs(kvm_context, env->cpu_index, &sregs);

    /* msrs */
    n = 0;
    set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
    set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
    set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
    if (kvm_has_msr_star)
        set_msr_entry(&msrs[n++], MSR_STAR, env->star);
    set_msr_entry(&msrs[n++], MSR_IA32_TSC, env->tsc);
#ifdef TARGET_X86_64
    if (lm_capable_kernel) {
        set_msr_entry(&msrs[n++], MSR_CSTAR, env->cstar);
        set_msr_entry(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
        set_msr_entry(&msrs[n++], MSR_FMASK, env->fmask);
        set_msr_entry(&msrs[n++], MSR_LSTAR, env->lstar);
    }
#endif

    rc = kvm_set_msrs(kvm_context, env->cpu_index, msrs, n);
    if (rc == -1)
        perror("kvm_set_msrs FAILED");
}
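
/*
 * save_regs(): the inverse of load_regs() -- read the vcpu state back from
 * kvm into qemu's CPUState after the guest has run.
 */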
static void save_regs(CPUState *env)
{
    struct kvm_regs regs;
    struct kvm_fpu fpu;
    struct kvm_sregs sregs;
    struct kvm_msr_entry msrs[MSR_COUNT];
    uint32_t hflags;
    uint32_t i, n, rc;

    kvm_get_regs(kvm_context, env->cpu_index, &regs);

    env->regs[R_EAX] = regs.rax;
    env->regs[R_EBX] = regs.rbx;
    env->regs[R_ECX] = regs.rcx;
    env->regs[R_EDX] = regs.rdx;
    env->regs[R_ESI] = regs.rsi;
    env->regs[R_EDI] = regs.rdi;
    env->regs[R_ESP] = regs.rsp;
    env->regs[R_EBP] = regs.rbp;
#ifdef TARGET_X86_64
    env->regs[8] = regs.r8;
    env->regs[9] = regs.r9;
    env->regs[10] = regs.r10;
    env->regs[11] = regs.r11;
    env->regs[12] = regs.r12;
    env->regs[13] = regs.r13;
    env->regs[14] = regs.r14;
    env->regs[15] = regs.r15;
#endif

    env->eflags = regs.rflags;
    env->eip = regs.rip;

    kvm_get_fpu(kvm_context, env->cpu_index, &fpu);
    env->fpstt = (fpu.fsw >> 11) & 7;
    env->fpus = fpu.fsw;
    env->fpuc = fpu.fcw;
    for (i = 0; i < 8; ++i)
        env->fptags[i] = !((fpu.ftwx >> i) & 1);
    memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
    memcpy(env->xmm_regs, fpu.xmm, sizeof env->xmm_regs);
    env->mxcsr = fpu.mxcsr;

    kvm_get_sregs(kvm_context, env->cpu_index, &sregs);

    memcpy(env->kvm_interrupt_bitmap, sregs.interrupt_bitmap, sizeof(env->kvm_interrupt_bitmap));

    get_seg(&env->segs[R_CS], &sregs.cs);
    get_seg(&env->segs[R_DS], &sregs.ds);
    get_seg(&env->segs[R_ES], &sregs.es);
    get_seg(&env->segs[R_FS], &sregs.fs);
    get_seg(&env->segs[R_GS], &sregs.gs);
    get_seg(&env->segs[R_SS], &sregs.ss);

    get_seg(&env->tr, &sregs.tr);
    get_seg(&env->ldt, &sregs.ldt);

    env->idt.limit = sregs.idt.limit;
    env->idt.base = sregs.idt.base;
    env->gdt.limit = sregs.gdt.limit;
    env->gdt.base = sregs.gdt.base;

    env->cr[0] = sregs.cr0;
    env->cr[2] = sregs.cr2;
    env->cr[3] = sregs.cr3;
    env->cr[4] = sregs.cr4;

    cpu_set_apic_base(env, sregs.apic_base);

    env->efer = sregs.efer;
    //cpu_set_apic_tpr(env, sregs.cr8);
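
    /*
     * env->hflags caches mode bits derived from CR0/CR4/EFER and the segment
     * descriptors (CPL, 32/64-bit code and stack, vm86 mode, address-size
     * handling, ...), so it has to be rebuilt here after the system
     * registers have been read back from kvm.
     */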

#define HFLAG_COPY_MASK ~( \
    HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
    HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
    HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
    HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)

    hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
    hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
    hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
        (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
    hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK));
    hflags |= (env->cr[4] & CR4_OSFXSR_MASK) <<
        (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT);

    if (env->efer & MSR_EFER_LMA) {
        hflags |= HF_LMA_MASK;
    }

    if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) {
        hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK;
    } else {
        hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >>
            (DESC_B_SHIFT - HF_CS32_SHIFT);
        hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >>
            (DESC_B_SHIFT - HF_SS32_SHIFT);
        if (!(env->cr[0] & CR0_PE_MASK) ||
            (env->eflags & VM_MASK) ||
            !(hflags & HF_CS32_MASK)) {
            hflags |= HF_ADDSEG_MASK;
        } else {
            hflags |= ((env->segs[R_DS].base |
                        env->segs[R_ES].base |
                        env->segs[R_SS].base) != 0) <<
                HF_ADDSEG_SHIFT;
        }
    }
    env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags;
    env->cc_src = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
    env->df = 1 - (2 * ((env->eflags >> 10) & 1));
    env->cc_op = CC_OP_EFLAGS;
    env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);

    /* msrs */
    n = 0;
    msrs[n++].index = MSR_IA32_SYSENTER_CS;
    msrs[n++].index = MSR_IA32_SYSENTER_ESP;
    msrs[n++].index = MSR_IA32_SYSENTER_EIP;
    if (kvm_has_msr_star)
        msrs[n++].index = MSR_STAR;
    msrs[n++].index = MSR_IA32_TSC;
#ifdef TARGET_X86_64
    if (lm_capable_kernel) {
        msrs[n++].index = MSR_CSTAR;
        msrs[n++].index = MSR_KERNELGSBASE;
        msrs[n++].index = MSR_FMASK;
        msrs[n++].index = MSR_LSTAR;
    }
#endif
    rc = kvm_get_msrs(kvm_context, env->cpu_index, msrs, n);
    if (rc == -1) {
        perror("kvm_get_msrs FAILED");
    }
    else {
        n = rc; /* actual number of MSRs */
        for (i = 0; i < n; i++) {
            if (get_msr_entry(&msrs[i], env))
                return;
        }
    }
}

#include <signal.h>
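
/*
 * Called back by libkvm before re-entering the guest: if the emulated PIC
 * has an interrupt pending, the vcpu is ready for injection and the guest
 * has IF set, inject it now.  The return value reports whether an
 * interrupt is still pending, so that libkvm can request an
 * interrupt-window exit.
 */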
static int try_push_interrupts(void *opaque)
{
    CPUState *env = cpu_single_env;
    int r, irq;

    if (env->ready_for_interrupt_injection &&
        (env->interrupt_request & CPU_INTERRUPT_HARD) &&
        (env->eflags & IF_MASK)) {
        env->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            r = kvm_inject_irq(kvm_context, env->cpu_index, irq);
            if (r < 0)
                printf("cpu %d fail inject %x\n", env->cpu_index, irq);
        }
    }

    return (env->interrupt_request & CPU_INTERRUPT_HARD) != 0;
}

static void post_kvm_run(void *opaque, int vcpu)
{
    CPUState *env = vcpu_env;

    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;
    env->eflags = kvm_get_interrupt_flag(kvm_context, vcpu)
        ? env->eflags | IF_MASK : env->eflags & ~IF_MASK;
    env->ready_for_interrupt_injection
        = kvm_is_ready_for_interrupt_injection(kvm_context, vcpu);

    cpu_set_apic_tpr(env, kvm_get_cr8(kvm_context, vcpu));
    cpu_set_apic_base(env, kvm_get_apic_base(kvm_context, vcpu));
}

static int pre_kvm_run(void *opaque, int vcpu)
{
    CPUState *env = cpu_single_env;

    if (env->cpu_index == 0 && wait_hack) {
        int i;

        wait_hack = 0;

        pthread_mutex_unlock(&qemu_mutex);
        for (i = 0; i < 10; ++i)
            usleep(1000);
        pthread_mutex_lock(&qemu_mutex);
    }

    if (!kvm_irqchip_in_kernel(kvm_context))
        kvm_set_cr8(kvm_context, vcpu, cpu_get_apic_tpr(env));
    if (env->interrupt_request & CPU_INTERRUPT_EXIT)
        return 1;
    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}

void kvm_load_registers(CPUState *env)
{
    if (kvm_allowed)
        load_regs(env);
}

void kvm_save_registers(CPUState *env)
{
    if (kvm_allowed)
        save_regs(env);
}

int kvm_cpu_exec(CPUState *env)
{
    int r;

    r = kvm_run(kvm_context, env->cpu_index);
    if (r < 0) {
        printf("kvm_run returned %d\n", r);
        exit(1);
    }

    return 0;
}

extern int vm_running;

static int has_work(CPUState *env)
{
    if (!vm_running)
        return 0;
    if (!(env->hflags & HF_HALTED_MASK))
        return 1;
    if ((env->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_EXIT)) &&
        (env->eflags & IF_MASK))
        return 1;
    return 0;
}

static int kvm_eat_signal(CPUState *env, int timeout)
{
    struct timespec ts;
    int r, e, ret = 0;
    siginfo_t siginfo;
    struct sigaction sa;

    ts.tv_sec = timeout / 1000;
    ts.tv_nsec = (timeout % 1000) * 1000000;
    r = sigtimedwait(&io_sigset, &siginfo, &ts);
    if (r == -1 && (errno == EAGAIN || errno == EINTR) && !timeout)
        return 0;
    e = errno;
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = vcpu_env;
    if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
        printf("sigtimedwait: %s\n", strerror(e));
        exit(1);
    }
    if (r != -1) {
        sigaction(siginfo.si_signo, NULL, &sa);
        sa.sa_handler(siginfo.si_signo);
        ret = 1;
    }
    pthread_mutex_unlock(&qemu_mutex);

    return ret;
}

static void kvm_eat_signals(CPUState *env, int timeout)
{
    int r = 0;

    while (kvm_eat_signal(env, 0))
        r = 1;
    if (!r && timeout) {
        r = kvm_eat_signal(env, timeout);
        if (r)
            while (kvm_eat_signal(env, 0))
                ;
    }
    /*
     * we call select() even if no signal was received, to account for
     * events for which there is no signal handler installed.
     */
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = vcpu_env;
    main_loop_wait(0);
    pthread_mutex_unlock(&qemu_mutex);
}

static void kvm_main_loop_wait(CPUState *env, int timeout)
{
    pthread_mutex_unlock(&qemu_mutex);
    if (env->cpu_index == 0)
        kvm_eat_signals(env, timeout);
    else {
        if (!kvm_irqchip_in_kernel(kvm_context) &&
            (timeout || vcpu_info[env->cpu_index].stopped)) {
            sigset_t set;
            int n;

        paused:
            sigemptyset(&set);
            sigaddset(&set, SIG_IPI);
            sigwait(&set, &n);
        } else {
            struct timespec ts;
            siginfo_t siginfo;
            sigset_t set;

            ts.tv_sec = 0;
            ts.tv_nsec = 0;
            sigemptyset(&set);
            sigaddset(&set, SIG_IPI);
            sigtimedwait(&set, &siginfo, &ts);
        }
        if (vcpu_info[env->cpu_index].stop) {
            vcpu_info[env->cpu_index].stop = 0;
            vcpu_info[env->cpu_index].stopped = 1;
            pthread_kill(vcpu_info[0].thread, SIG_IPI);
            goto paused;
        }
    }
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;
    vcpu_info[env->cpu_index].signalled = 0;
}

static int all_threads_paused(void)
{
    int i;

    for (i = 1; i < smp_cpus; ++i)
        if (!vcpu_info[i].stopped)
            return 0;
    return 1;
}

static void pause_other_threads(void)
{
    int i;

    for (i = 1; i < smp_cpus; ++i) {
        vcpu_info[i].stop = 1;
        pthread_kill(vcpu_info[i].thread, SIG_IPI);
    }
    while (!all_threads_paused())
        kvm_eat_signals(vcpu_env, 0);
}

static void resume_other_threads(void)
{
    int i;

    for (i = 1; i < smp_cpus; ++i) {
        vcpu_info[i].stop = 0;
        vcpu_info[i].stopped = 0;
        pthread_kill(vcpu_info[i].thread, SIG_IPI);
    }
}

static void kvm_vm_state_change_handler(void *context, int running)
{
    if (running)
        resume_other_threads();
    else
        pause_other_threads();
}

static void update_regs_for_sipi(CPUState *env)
{
    SegmentCache cs = env->segs[R_CS];

    save_regs(env);
    env->segs[R_CS] = cs;
    env->eip = 0;
    load_regs(env);
    vcpu_info[env->cpu_index].sipi_needed = 0;
    vcpu_info[env->cpu_index].init = 0;
}

static void update_regs_for_init(CPUState *env)
{
    cpu_reset(env);
    load_regs(env);
}

static void setup_kernel_sigmask(CPUState *env)
{
    sigset_t set;

    sigprocmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    if (env->cpu_index == 0)
        sigandset(&set, &set, &io_negsigset);

    kvm_set_signal_mask(kvm_context, env->cpu_index, &set);
}
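
/*
 * Per-vcpu main loop: wait until there is work, apply pending SIPI/INIT
 * requests (when the irqchip is emulated in userspace), run the vcpu, and
 * handle shutdown/powerdown/reset requests while holding qemu_mutex.
 */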
static int kvm_main_loop_cpu(CPUState *env)
{
    struct vcpu_info *info = &vcpu_info[env->cpu_index];

    setup_kernel_sigmask(env);
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;
    while (1) {
        while (!has_work(env))
            kvm_main_loop_wait(env, 10);
        if (env->interrupt_request & CPU_INTERRUPT_HARD)
            env->hflags &= ~HF_HALTED_MASK;
        if (!kvm_irqchip_in_kernel(kvm_context) && info->sipi_needed)
            update_regs_for_sipi(env);
        if (!kvm_irqchip_in_kernel(kvm_context) && info->init)
            update_regs_for_init(env);
        if (!(env->hflags & HF_HALTED_MASK) && !info->init)
            kvm_cpu_exec(env);
        env->interrupt_request &= ~CPU_INTERRUPT_EXIT;
        kvm_main_loop_wait(env, 0);
        if (qemu_shutdown_requested())
            break;
        else if (qemu_powerdown_requested())
            qemu_system_powerdown();
        else if (qemu_reset_requested()) {
            env->interrupt_request = 0;
            qemu_system_reset();
            load_regs(env);
        }
    }
    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}

static void *ap_main_loop(void *_env)
{
    CPUState *env = _env;
    sigset_t signals;

    vcpu_env = env;
    sigfillset(&signals);
    //sigdelset(&signals, SIG_IPI);
    sigprocmask(SIG_BLOCK, &signals, NULL);
    kvm_create_vcpu(kvm_context, env->cpu_index);
    kvm_qemu_init_env(env);
    if (kvm_irqchip_in_kernel(kvm_context))
        env->hflags &= ~HF_HALTED_MASK;
    kvm_main_loop_cpu(env);
    return NULL;
}

static void kvm_add_signal(int signum)
{
    sigaddset(&io_sigset, signum);
    sigdelset(&io_negsigset, signum);
    sigprocmask(SIG_BLOCK, &io_sigset, NULL);
}

int kvm_init_ap(void)
{
    CPUState *env = first_cpu->next_cpu;
    int i;

    qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
    sigemptyset(&io_sigset);
    sigfillset(&io_negsigset);
    kvm_add_signal(SIGIO);
    kvm_add_signal(SIGALRM);
    kvm_add_signal(SIGUSR2);
    if (!kvm_irqchip_in_kernel(kvm_context))
        kvm_add_signal(SIG_IPI);

    vcpu_env = first_cpu;
    signal(SIG_IPI, sig_ipi_handler);
    for (i = 1; i < smp_cpus; ++i) {
        pthread_create(&vcpu_info[i].thread, NULL, ap_main_loop, env);
        env = env->next_cpu;
    }
    return 0;
}

int kvm_main_loop(void)
{
    vcpu_info[0].thread = pthread_self();
    return kvm_main_loop_cpu(first_cpu);
}
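
/*
 * libkvm exit handlers, wired up through the qemu_kvm_ops table below.
 * Guest port I/O is forwarded to qemu's ioport emulation (cpu_inb, cpu_outb
 * and friends); kvm_outb also intercepts writes to the SMI command port
 * (0xb2) to toggle ACPI through the PM I/O space.
 */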
static int kvm_debug(void *opaque, int vcpu)
{
    CPUState *env = cpu_single_env;

    env->exception_index = EXCP_DEBUG;
    return 1;
}

static int kvm_inb(void *opaque, uint16_t addr, uint8_t *data)
{
    *data = cpu_inb(0, addr);
    return 0;
}

static int kvm_inw(void *opaque, uint16_t addr, uint16_t *data)
{
    *data = cpu_inw(0, addr);
    return 0;
}

static int kvm_inl(void *opaque, uint16_t addr, uint32_t *data)
{
    *data = cpu_inl(0, addr);
    return 0;
}

#define PM_IO_BASE 0xb000

static int kvm_outb(void *opaque, uint16_t addr, uint8_t data)
{
    if (addr == 0xb2) {
        switch (data) {
        case 0: {
            cpu_outb(0, 0xb3, 0);
            break;
        }
        case 0xf0: {
            unsigned x;

            /* disable acpi */
            x = cpu_inw(0, PM_IO_BASE + 4);
            x &= ~1;
            cpu_outw(0, PM_IO_BASE + 4, x);
            break;
        }
        case 0xf1: {
            unsigned x;

            /* enable acpi */
            x = cpu_inw(0, PM_IO_BASE + 4);
            x |= 1;
            cpu_outw(0, PM_IO_BASE + 4, x);
            break;
        }
        default:
            break;
        }
        return 0;
    }
    cpu_outb(0, addr, data);
    return 0;
}

static int kvm_outw(void *opaque, uint16_t addr, uint16_t data)
{
    cpu_outw(0, addr, data);
    return 0;
}

static int kvm_outl(void *opaque, uint16_t addr, uint32_t data)
{
    cpu_outl(0, addr, data);
    return 0;
}
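
/*
 * MMIO exits are forwarded to qemu's physical-memory accessors
 * (ldub_phys, stb_phys and friends); kvm_readl additionally works around
 * a guest (Red Hat 7.1) that issues reads crossing into the VGA window at
 * 0xa0000.
 */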
static int kvm_readb(void *opaque, uint64_t addr, uint8_t *data)
{
    *data = ldub_phys(addr);
    return 0;
}

static int kvm_readw(void *opaque, uint64_t addr, uint16_t *data)
{
    *data = lduw_phys(addr);
    return 0;
}

static int kvm_readl(void *opaque, uint64_t addr, uint32_t *data)
{
    /* hack: Red Hat 7.1 generates some weird accesses. */
    if (addr > 0xa0000 - 4 && addr < 0xa0000) {
        *data = 0;
        return 0;
    }

    *data = ldl_phys(addr);
    return 0;
}

static int kvm_readq(void *opaque, uint64_t addr, uint64_t *data)
{
    *data = ldq_phys(addr);
    return 0;
}

static int kvm_writeb(void *opaque, uint64_t addr, uint8_t data)
{
    stb_phys(addr, data);
    return 0;
}

static int kvm_writew(void *opaque, uint64_t addr, uint16_t data)
{
    stw_phys(addr, data);
    return 0;
}

static int kvm_writel(void *opaque, uint64_t addr, uint32_t data)
{
    stl_phys(addr, data);
    return 0;
}

static int kvm_writeq(void *opaque, uint64_t addr, uint64_t data)
{
    stq_phys(addr, data);
    return 0;
}

static int kvm_io_window(void *opaque)
{
    return 1;
}

static int kvm_halt(void *opaque, int vcpu)
{
    CPUState *env = cpu_single_env;

    if (!((env->interrupt_request & CPU_INTERRUPT_HARD) &&
          (env->eflags & IF_MASK))) {
        env->hflags |= HF_HALTED_MASK;
        env->exception_index = EXCP_HLT;
    }

    return 1;
}

static int kvm_shutdown(void *opaque, int vcpu)
{
    qemu_system_reset_request();
    return 1;
}

static struct kvm_callbacks qemu_kvm_ops = {
    .debug = kvm_debug,
    .inb = kvm_inb,
    .inw = kvm_inw,
    .inl = kvm_inl,
    .outb = kvm_outb,
    .outw = kvm_outw,
    .outl = kvm_outl,
    .readb = kvm_readb,
    .readw = kvm_readw,
    .readl = kvm_readl,
    .readq = kvm_readq,
    .writeb = kvm_writeb,
    .writew = kvm_writew,
    .writel = kvm_writel,
    .writeq = kvm_writeq,
    .halt = kvm_halt,
    .shutdown = kvm_shutdown,
    .io_window = kvm_io_window,
    .try_push_interrupts = try_push_interrupts,
    .post_kvm_run = post_kvm_run,
    .pre_kvm_run = pre_kvm_run,
};

int kvm_qemu_init()
{
    /* Try to initialize kvm */
    kvm_context = kvm_init(&qemu_kvm_ops, cpu_single_env);
    if (!kvm_context) {
        return -1;
    }

    return 0;
}

int kvm_qemu_create_context(void)
{
    int i;

    if (!kvm_irqchip) {
        kvm_disable_irqchip_creation(kvm_context);
    }
    if (kvm_create(kvm_context, phys_ram_size, (void**)&phys_ram_base) < 0) {
        kvm_qemu_destroy();
        return -1;
    }
    if (kvm_shadow_memory)
        kvm_set_shadow_pages(kvm_context, kvm_shadow_memory);
    kvm_msr_list = kvm_get_msr_list(kvm_context);
    if (!kvm_msr_list) {
        kvm_qemu_destroy();
        return -1;
    }
    for (i = 0; i < kvm_msr_list->nmsrs; ++i)
        if (kvm_msr_list->indices[i] == MSR_STAR)
            kvm_has_msr_star = 1;
    return 0;
}

void kvm_qemu_destroy(void)
{
    kvm_finalize(kvm_context);
}
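
/*
 * Register a guest physical memory range with kvm.  When the kernel
 * supports KVM_CAP_USER_MEMORY, RAM and ROM regions are mapped directly
 * from qemu's phys_ram_base into the guest (punching a hole first if the
 * range intersects an existing slot); otherwise ROM contents are simply
 * copied into the already-registered RAM.
 */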
void kvm_cpu_register_physical_memory(target_phys_addr_t start_addr,
                                      unsigned long size,
                                      unsigned long phys_offset)
{
#ifdef KVM_CAP_USER_MEMORY
    int r = 0;

    r = kvm_check_extension(kvm_context, KVM_CAP_USER_MEMORY);
    if (r) {
        if (!(phys_offset & ~TARGET_PAGE_MASK)) {
            r = kvm_is_allocated_mem(kvm_context, start_addr, size);
            if (r)
                return;
            r = kvm_is_intersecting_mem(kvm_context, start_addr);
            if (r)
                kvm_create_mem_hole(kvm_context, start_addr, size);
            r = kvm_register_userspace_phys_mem(kvm_context, start_addr,
                                                phys_ram_base + phys_offset,
                                                size, 1);
        }
        if (phys_offset & IO_MEM_ROM) {
            phys_offset &= ~IO_MEM_ROM;
            r = kvm_is_intersecting_mem(kvm_context, start_addr);
            if (r)
                kvm_create_mem_hole(kvm_context, start_addr, size);
            r = kvm_register_userspace_phys_mem(kvm_context, start_addr,
                                                phys_ram_base + phys_offset,
                                                size, 1);
        }
        if (r < 0) {
            printf("kvm_cpu_register_physical_memory: failed\n");
            exit(1);
        }
        return;
    }
#endif
    if (phys_offset & IO_MEM_ROM) {
        phys_offset &= ~IO_MEM_ROM;
        memcpy(phys_ram_base + start_addr, phys_ram_base + phys_offset, size);
    }
}

int kvm_qemu_check_extension(int ext)
{
    return kvm_check_extension(kvm_context, ext);
}
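
/*
 * Run the CPUID instruction on the host.  All registers are saved and
 * restored around the call and the results are returned through a memory
 * operand, so the asm avoids clobbering ebx/rbx (which may be reserved,
 * e.g. for PIC) and needs no clobber list beyond "memory".
 */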
static void host_cpuid(uint32_t function, uint32_t *eax, uint32_t *ebx,
                       uint32_t *ecx, uint32_t *edx)
{
    uint32_t vec[4];

    vec[0] = function;
    asm volatile (
#ifdef __x86_64__
        "sub $128, %%rsp \n\t"  /* skip red zone */
        "push %0;  push %%rsi \n\t"
        "push %%rax; push %%rbx; push %%rcx; push %%rdx \n\t"
        "mov 8*5(%%rsp), %%rsi \n\t"
        "mov (%%rsi), %%eax \n\t"
        "cpuid \n\t"
        "mov %%eax, (%%rsi) \n\t"
        "mov %%ebx, 4(%%rsi) \n\t"
        "mov %%ecx, 8(%%rsi) \n\t"
        "mov %%edx, 12(%%rsi) \n\t"
        "pop %%rdx; pop %%rcx; pop %%rbx; pop %%rax \n\t"
        "pop %%rsi; pop %0 \n\t"
        "add $128, %%rsp"
#else
        "push %0;  push %%esi \n\t"
        "push %%eax; push %%ebx; push %%ecx; push %%edx \n\t"
        "mov 4*5(%%esp), %%esi \n\t"
        "mov (%%esi), %%eax \n\t"
        "cpuid \n\t"
        "mov %%eax, (%%esi) \n\t"
        "mov %%ebx, 4(%%esi) \n\t"
        "mov %%ecx, 8(%%esi) \n\t"
        "mov %%edx, 12(%%esi) \n\t"
        "pop %%edx; pop %%ecx; pop %%ebx; pop %%eax \n\t"
        "pop %%esi; pop %0 \n\t"
#endif
        : : "rm"(vec) : "memory");
    if (eax)
        *eax = vec[0];
    if (ebx)
        *ebx = vec[1];
    if (ecx)
        *ecx = vec[2];
    if (edx)
        *edx = vec[3];
}
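
/*
 * Fill one kvm_cpuid_entry for the guest: run qemu's cpuid emulation for
 * the requested leaf, then drop feature bits the host CPU or kernel cannot
 * back (long mode, syscall, nx) and hide SVM from the guest.
 */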
static void do_cpuid_ent(struct kvm_cpuid_entry *e, uint32_t function,
                         CPUState *env)
{
    env->regs[R_EAX] = function;
    qemu_kvm_cpuid_on_env(env);
    e->function = function;
    e->eax = env->regs[R_EAX];
    e->ebx = env->regs[R_EBX];
    e->ecx = env->regs[R_ECX];
    e->edx = env->regs[R_EDX];
    if (function == 0x80000001) {
        uint32_t h_eax, h_edx;
        struct utsname utsname;

        host_cpuid(function, &h_eax, NULL, NULL, &h_edx);
        uname(&utsname);
        lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;

        // long mode
        if ((h_edx & 0x20000000) == 0 || !lm_capable_kernel)
            e->edx &= ~0x20000000u;
        // syscall
        if ((h_edx & 0x00000800) == 0)
            e->edx &= ~0x00000800u;
        // nx
        if ((h_edx & 0x00100000) == 0)
            e->edx &= ~0x00100000u;
        // svm
        if (e->ecx & 4)
            e->ecx &= ~4u;
    }
    // sysenter isn't supported in compatibility mode on AMD, and syscall
    // isn't supported in compatibility mode on Intel, so advertise the
    // actual cpu, and say goodbye to migration between different vendors
    // if you use compatibility mode.
    if (function == 0) {
        uint32_t bcd[3];

        host_cpuid(0, NULL, &bcd[0], &bcd[1], &bcd[2]);
        e->ebx = bcd[0];
        e->ecx = bcd[1];
        e->edx = bcd[2];
    }
}

int kvm_qemu_init_env(CPUState *cenv)
{
    struct kvm_cpuid_entry cpuid_ent[100];
#ifdef KVM_CPUID_SIGNATURE
    struct kvm_cpuid_entry *pv_ent;
    uint32_t signature[3];
#endif
    int cpuid_nent = 0;
    CPUState copy;
    uint32_t i, limit;

    copy = *cenv;

#ifdef KVM_CPUID_SIGNATURE
    /* Paravirtualization CPUIDs */
    memcpy(signature, "KVMKVMKVM\0\0\0", 12);
    pv_ent = &cpuid_ent[cpuid_nent++];
    memset(pv_ent, 0, sizeof(*pv_ent));
    pv_ent->function = KVM_CPUID_SIGNATURE;
    pv_ent->eax = 0;
    pv_ent->ebx = signature[0];
    pv_ent->ecx = signature[1];
    pv_ent->edx = signature[2];

    pv_ent = &cpuid_ent[cpuid_nent++];
    memset(pv_ent, 0, sizeof(*pv_ent));
    pv_ent->function = KVM_CPUID_FEATURES;
    pv_ent->eax = 0;
#endif

    copy.regs[R_EAX] = 0;
    qemu_kvm_cpuid_on_env(&copy);
    limit = copy.regs[R_EAX];

    for (i = 0; i <= limit; ++i)
        do_cpuid_ent(&cpuid_ent[cpuid_nent++], i, &copy);

    copy.regs[R_EAX] = 0x80000000;
    qemu_kvm_cpuid_on_env(&copy);
    limit = copy.regs[R_EAX];

    for (i = 0x80000000; i <= limit; ++i)
        do_cpuid_ent(&cpuid_ent[cpuid_nent++], i, &copy);

    kvm_setup_cpuid(kvm_context, cenv->cpu_index, cpuid_nent, cpuid_ent);

    return 0;
}

int kvm_update_debugger(CPUState *env)
{
    struct kvm_debug_guest dbg;
    int i;

    dbg.enabled = 0;
    if (env->nb_breakpoints || env->singlestep_enabled) {
        dbg.enabled = 1;
        for (i = 0; i < 4 && i < env->nb_breakpoints; ++i) {
            dbg.breakpoints[i].enabled = 1;
            dbg.breakpoints[i].address = env->breakpoints[i];
        }
        dbg.singlestep = env->singlestep_enabled;
    }
    return kvm_guest_debug(kvm_context, env->cpu_index, &dbg);
}

/*
 * dirty pages logging
 */
/* FIXME: use unsigned long pointer instead of unsigned char */
unsigned char *kvm_dirty_bitmap = NULL;
int kvm_physical_memory_set_dirty_tracking(int enable)
{
    int r = 0;

    if (!kvm_allowed)
        return 0;

    if (enable) {
        if (!kvm_dirty_bitmap) {
            unsigned bitmap_size = BITMAP_SIZE(phys_ram_size);
            kvm_dirty_bitmap = qemu_malloc(bitmap_size);
            if (kvm_dirty_bitmap == NULL) {
                perror("Failed to allocate dirty pages bitmap");
                r = -1;
            }
            else {
                r = kvm_dirty_pages_log_enable_all(kvm_context);
            }
        }
    }
    else {
        if (kvm_dirty_bitmap) {
            r = kvm_dirty_pages_log_reset(kvm_context);
            qemu_free(kvm_dirty_bitmap);
            kvm_dirty_bitmap = NULL;
        }
    }
    return r;
}

/* get kvm's dirty pages bitmap and update qemu's */
int kvm_get_dirty_pages_log_slot(unsigned long start_addr,
                                 unsigned char *bitmap,
                                 unsigned int offset,
                                 unsigned int len)
{
    int r;
    unsigned int i, j, n = 0;
    unsigned char c;
    unsigned page_number, addr, addr1;

    memset(bitmap, 0, len);
    r = kvm_get_dirty_pages(kvm_context, start_addr, bitmap);
    if (r)
        return r;

    /*
     * bitmap-traveling is faster than memory-traveling (for addr...)
     * especially when most of the memory is not dirty.
     */
    for (i = 0; i < len; i++) {
        c = bitmap[i];
        while (c > 0) {
            j = ffsl(c) - 1;
            c &= ~(1u << j);
            page_number = i * 8 + j;
            addr1 = page_number * TARGET_PAGE_SIZE;
            addr = offset + addr1;
            cpu_physical_memory_set_dirty(addr);
            n++;
        }
    }
    return 0;
}

/*
 * get kvm's dirty pages bitmap and update qemu's
 * we only care about physical ram, which resides in slots 0 and 3
 */
int kvm_update_dirty_pages_log(void)
{
    int r = 0, len;

    len = BITMAP_SIZE(0xa0000);
    r = kvm_get_dirty_pages_log_slot(0, kvm_dirty_bitmap, 0, len);
    len = BITMAP_SIZE(phys_ram_size - 0xc0000);
    r = r || kvm_get_dirty_pages_log_slot(0xc0000, kvm_dirty_bitmap, 0xc0000, len);
    return r;
}

int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap)
{
    int r = 0, len, offset;

    len = BITMAP_SIZE(phys_ram_size);
    memset(bitmap, 0, len);

    r = kvm_get_mem_map(kvm_context, 0, bitmap);
    if (r)
        goto out;

    offset = BITMAP_SIZE(0xc0000);
    r = kvm_get_mem_map(kvm_context, 0xc0000, bitmap + offset);

out:
    return r;
}

#ifdef KVM_CAP_IRQCHIP

int kvm_set_irq(int irq, int level)
{
    return kvm_set_irq_level(kvm_context, irq, level);
}

#endif

#endif