Make qemu_ram_alloc() be able to allocate more than 4GB of memory
[qemu-kvm/fedora.git] / qemu-kvm.c
blob 38ba5c3da2f8fe2e324077a219b5ca29196144ed
#include "config.h"
#include "config-host.h"

#ifdef USE_KVM
#define KVM_ALLOWED_DEFAULT 1
#else
#define KVM_ALLOWED_DEFAULT 0
#endif

int kvm_allowed = KVM_ALLOWED_DEFAULT;
static int lm_capable_kernel;
int kvm_irqchip = 1;

#ifdef USE_KVM

#include <string.h>
#include "vl.h"

#include "qemu-kvm.h"
#include <libkvm.h>
#include <pthread.h>
#include <sys/utsname.h>

#define MSR_IA32_TSC 0x10

extern void perror(const char *s);

kvm_context_t kvm_context;
static struct kvm_msr_list *kvm_msr_list;
static int kvm_has_msr_star;

extern int smp_cpus;
extern unsigned int kvm_shadow_memory;

pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
static __thread CPUState *vcpu_env;

static sigset_t io_sigset, io_negsigset;

static int wait_hack;

#define SIG_IPI (SIGRTMIN+4)
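
/*
 * Note: SIG_IPI is a POSIX real-time signal used as a software
 * "inter-processor interrupt": the io thread (vcpu 0) pokes the other
 * vcpu threads out of KVM_RUN with pthread_kill() so they notice
 * pending interrupts, SIPIs or stop requests.
 */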
struct vcpu_info {
    int sipi_needed;
    int init;
    pthread_t thread;
    int signalled;
    int stop;
    int stopped;
} vcpu_info[4];

static void sig_ipi_handler(int n)
{
}

void kvm_update_interrupt_request(CPUState *env)
{
    if (env && env != vcpu_env) {
        if (vcpu_info[env->cpu_index].signalled)
            return;
        vcpu_info[env->cpu_index].signalled = 1;
        if (vcpu_info[env->cpu_index].thread)
            pthread_kill(vcpu_info[env->cpu_index].thread, SIG_IPI);
    }
}
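
/*
 * Note: the empty sig_ipi_handler() above is intentional; delivery of
 * SIG_IPI only needs to interrupt the blocked ioctl in the target
 * thread.  The 'signalled' flag coalesces repeated requests so a vcpu
 * is signalled at most once per iteration of its main loop, which
 * clears the flag.
 */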
void kvm_update_after_sipi(CPUState *env)
{
    vcpu_info[env->cpu_index].sipi_needed = 1;
    kvm_update_interrupt_request(env);

    /*
     * the qemu bios waits using a busy loop that's much too short for
     * kvm.  add a wait after the first sipi.
     */
    {
        static int first_sipi = 1;

        if (first_sipi) {
            wait_hack = 1;
            first_sipi = 0;
        }
    }
}

void kvm_apic_init(CPUState *env)
{
    if (env->cpu_index != 0)
        vcpu_info[env->cpu_index].init = 1;
    kvm_update_interrupt_request(env);
}
static void set_msr_entry(struct kvm_msr_entry *entry, uint32_t index,
                          uint64_t data)
{
    entry->index = index;
    entry->data  = data;
}

/* returns 0 on success, non-0 on failure */
static int get_msr_entry(struct kvm_msr_entry *entry, CPUState *env)
{
    switch (entry->index) {
    case MSR_IA32_SYSENTER_CS:
        env->sysenter_cs = entry->data;
        break;
    case MSR_IA32_SYSENTER_ESP:
        env->sysenter_esp = entry->data;
        break;
    case MSR_IA32_SYSENTER_EIP:
        env->sysenter_eip = entry->data;
        break;
    case MSR_STAR:
        env->star = entry->data;
        break;
#ifdef TARGET_X86_64
    case MSR_CSTAR:
        env->cstar = entry->data;
        break;
    case MSR_KERNELGSBASE:
        env->kernelgsbase = entry->data;
        break;
    case MSR_FMASK:
        env->fmask = entry->data;
        break;
    case MSR_LSTAR:
        env->lstar = entry->data;
        break;
#endif
    case MSR_IA32_TSC:
        env->tsc = entry->data;
        break;
    default:
        printf("Warning: unknown MSR index 0x%x\n", entry->index);
        return 1;
    }
    return 0;
}
#ifdef TARGET_X86_64
#define MSR_COUNT 9
#else
#define MSR_COUNT 5
#endif

static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = 3;
    lhs->present = 1;
    lhs->dpl = 3;
    lhs->db = 0;
    lhs->s = 1;
    lhs->l = 0;
    lhs->g = 0;
    lhs->avl = 0;
    lhs->unusable = 0;
}

static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    unsigned flags = rhs->flags;
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
    lhs->present = (flags & DESC_P_MASK) != 0;
    lhs->dpl = rhs->selector & 3;
    lhs->db = (flags >> DESC_B_SHIFT) & 1;
    lhs->s = (flags & DESC_S_MASK) != 0;
    lhs->l = (flags >> DESC_L_SHIFT) & 1;
    lhs->g = (flags & DESC_G_MASK) != 0;
    lhs->avl = (flags & DESC_AVL_MASK) != 0;
    lhs->unusable = 0;
}
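
/*
 * Example (illustrative): qemu's cached flags word mirrors the high
 * word of a hardware descriptor, so a flat 32-bit code segment with
 * flags 0x00cf9a00 unpacks as type=0xa (execute/read), s=1, present=1,
 * db=1, g=1.  Note that dpl is derived above from the selector's RPL
 * bits rather than from the descriptor flags.
 */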
static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->flags =
        (rhs->type << DESC_TYPE_SHIFT)
        | (rhs->present * DESC_P_MASK)
        | (rhs->dpl << DESC_DPL_SHIFT)
        | (rhs->db << DESC_B_SHIFT)
        | (rhs->s * DESC_S_MASK)
        | (rhs->l << DESC_L_SHIFT)
        | (rhs->g * DESC_G_MASK)
        | (rhs->avl * DESC_AVL_MASK);
}

/* the reset values of qemu are not compatible with SVM;
 * this function is used to fix the segment descriptor values */
static void fix_realmode_dataseg(struct kvm_segment *seg)
{
    seg->type = 0x02;
    seg->present = 1;
    seg->s = 1;
}
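
/*
 * Note: type 0x02 with s=1 is a plain read/write data segment, a shape
 * the hardware's consistency checks will accept for real-mode
 * segments; qemu's reset state leaves these fields in a form that SVM
 * rejects, hence the fixup above.
 */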
static void load_regs(CPUState *env)
{
    struct kvm_regs regs;
    struct kvm_fpu fpu;
    struct kvm_sregs sregs;
    struct kvm_msr_entry msrs[MSR_COUNT];
    int rc, n, i;

    regs.rax = env->regs[R_EAX];
    regs.rbx = env->regs[R_EBX];
    regs.rcx = env->regs[R_ECX];
    regs.rdx = env->regs[R_EDX];
    regs.rsi = env->regs[R_ESI];
    regs.rdi = env->regs[R_EDI];
    regs.rsp = env->regs[R_ESP];
    regs.rbp = env->regs[R_EBP];
#ifdef TARGET_X86_64
    regs.r8 = env->regs[8];
    regs.r9 = env->regs[9];
    regs.r10 = env->regs[10];
    regs.r11 = env->regs[11];
    regs.r12 = env->regs[12];
    regs.r13 = env->regs[13];
    regs.r14 = env->regs[14];
    regs.r15 = env->regs[15];
#endif

    regs.rflags = env->eflags;
    regs.rip = env->eip;

    kvm_set_regs(kvm_context, env->cpu_index, &regs);

    memset(&fpu, 0, sizeof fpu);
    fpu.fsw = env->fpus & ~(7 << 11);
    fpu.fsw |= (env->fpstt & 7) << 11;
    fpu.fcw = env->fpuc;
    for (i = 0; i < 8; ++i)
        fpu.ftwx |= (!env->fptags[i]) << i;
    memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
    memcpy(fpu.xmm, env->xmm_regs, sizeof env->xmm_regs);
    fpu.mxcsr = env->mxcsr;
    kvm_set_fpu(kvm_context, env->cpu_index, &fpu);

    memcpy(sregs.interrupt_bitmap, env->kvm_interrupt_bitmap, sizeof(sregs.interrupt_bitmap));

    if ((env->eflags & VM_MASK)) {
        set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
        set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
        set_v8086_seg(&sregs.es, &env->segs[R_ES]);
        set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
        set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
        set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
    } else {
        set_seg(&sregs.cs, &env->segs[R_CS]);
        set_seg(&sregs.ds, &env->segs[R_DS]);
        set_seg(&sregs.es, &env->segs[R_ES]);
        set_seg(&sregs.fs, &env->segs[R_FS]);
        set_seg(&sregs.gs, &env->segs[R_GS]);
        set_seg(&sregs.ss, &env->segs[R_SS]);

        if (env->cr[0] & CR0_PE_MASK) {
            /* force ss cpl to cs cpl */
            sregs.ss.selector = (sregs.ss.selector & ~3) |
                (sregs.cs.selector & 3);
            sregs.ss.dpl = sregs.ss.selector & 3;
        }

        if (!(env->cr[0] & CR0_PG_MASK)) {
            fix_realmode_dataseg(&sregs.cs);
            fix_realmode_dataseg(&sregs.ds);
            fix_realmode_dataseg(&sregs.es);
            fix_realmode_dataseg(&sregs.fs);
            fix_realmode_dataseg(&sregs.gs);
            fix_realmode_dataseg(&sregs.ss);
        }
    }

    set_seg(&sregs.tr, &env->tr);
    set_seg(&sregs.ldt, &env->ldt);

    sregs.idt.limit = env->idt.limit;
    sregs.idt.base = env->idt.base;
    sregs.gdt.limit = env->gdt.limit;
    sregs.gdt.base = env->gdt.base;

    sregs.cr0 = env->cr[0];
    sregs.cr2 = env->cr[2];
    sregs.cr3 = env->cr[3];
    sregs.cr4 = env->cr[4];

    sregs.apic_base = cpu_get_apic_base(env);
    sregs.efer = env->efer;
    sregs.cr8 = cpu_get_apic_tpr(env);

    kvm_set_sregs(kvm_context, env->cpu_index, &sregs);

    /* msrs */
    n = 0;
    set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
    set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
    set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
    if (kvm_has_msr_star)
        set_msr_entry(&msrs[n++], MSR_STAR, env->star);
    set_msr_entry(&msrs[n++], MSR_IA32_TSC, env->tsc);
#ifdef TARGET_X86_64
    if (lm_capable_kernel) {
        set_msr_entry(&msrs[n++], MSR_CSTAR, env->cstar);
        set_msr_entry(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
        set_msr_entry(&msrs[n++], MSR_FMASK, env->fmask);
        set_msr_entry(&msrs[n++], MSR_LSTAR, env->lstar);
    }
#endif

    rc = kvm_set_msrs(kvm_context, env->cpu_index, msrs, n);
    if (rc == -1)
        perror("kvm_set_msrs FAILED");
}
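
/*
 * Note: the msrs[] array above is sized by MSR_COUNT, which matches
 * the worst case pushed here: 5 entries (sysenter cs/esp/eip, star,
 * tsc) plus the 4 long-mode MSRs on TARGET_X86_64, for a total of 9.
 */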
static void save_regs(CPUState *env)
{
    struct kvm_regs regs;
    struct kvm_fpu fpu;
    struct kvm_sregs sregs;
    struct kvm_msr_entry msrs[MSR_COUNT];
    uint32_t hflags;
    uint32_t i, n, rc;

    kvm_get_regs(kvm_context, env->cpu_index, &regs);

    env->regs[R_EAX] = regs.rax;
    env->regs[R_EBX] = regs.rbx;
    env->regs[R_ECX] = regs.rcx;
    env->regs[R_EDX] = regs.rdx;
    env->regs[R_ESI] = regs.rsi;
    env->regs[R_EDI] = regs.rdi;
    env->regs[R_ESP] = regs.rsp;
    env->regs[R_EBP] = regs.rbp;
#ifdef TARGET_X86_64
    env->regs[8] = regs.r8;
    env->regs[9] = regs.r9;
    env->regs[10] = regs.r10;
    env->regs[11] = regs.r11;
    env->regs[12] = regs.r12;
    env->regs[13] = regs.r13;
    env->regs[14] = regs.r14;
    env->regs[15] = regs.r15;
#endif

    env->eflags = regs.rflags;
    env->eip = regs.rip;

    kvm_get_fpu(kvm_context, env->cpu_index, &fpu);
    env->fpstt = (fpu.fsw >> 11) & 7;
    env->fpus = fpu.fsw;
    env->fpuc = fpu.fcw;
    for (i = 0; i < 8; ++i)
        env->fptags[i] = !((fpu.ftwx >> i) & 1);
    memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
    memcpy(env->xmm_regs, fpu.xmm, sizeof env->xmm_regs);
    env->mxcsr = fpu.mxcsr;

    kvm_get_sregs(kvm_context, env->cpu_index, &sregs);

    memcpy(env->kvm_interrupt_bitmap, sregs.interrupt_bitmap, sizeof(env->kvm_interrupt_bitmap));

    get_seg(&env->segs[R_CS], &sregs.cs);
    get_seg(&env->segs[R_DS], &sregs.ds);
    get_seg(&env->segs[R_ES], &sregs.es);
    get_seg(&env->segs[R_FS], &sregs.fs);
    get_seg(&env->segs[R_GS], &sregs.gs);
    get_seg(&env->segs[R_SS], &sregs.ss);

    get_seg(&env->tr, &sregs.tr);
    get_seg(&env->ldt, &sregs.ldt);

    env->idt.limit = sregs.idt.limit;
    env->idt.base = sregs.idt.base;
    env->gdt.limit = sregs.gdt.limit;
    env->gdt.base = sregs.gdt.base;

    env->cr[0] = sregs.cr0;
    env->cr[2] = sregs.cr2;
    env->cr[3] = sregs.cr3;
    env->cr[4] = sregs.cr4;

    cpu_set_apic_base(env, sregs.apic_base);

    env->efer = sregs.efer;
    //cpu_set_apic_tpr(env, sregs.cr8);

#define HFLAG_COPY_MASK ~( \
    HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
    HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
    HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
    HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)

    hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
    hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
    hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
        (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
    hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK));
    hflags |= (env->cr[4] & CR4_OSFXSR_MASK) <<
        (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT);

    if (env->efer & MSR_EFER_LMA) {
        hflags |= HF_LMA_MASK;
    }

    if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) {
        hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK;
    } else {
        hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >>
            (DESC_B_SHIFT - HF_CS32_SHIFT);
        hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >>
            (DESC_B_SHIFT - HF_SS32_SHIFT);
        if (!(env->cr[0] & CR0_PE_MASK) ||
            (env->eflags & VM_MASK) ||
            !(hflags & HF_CS32_MASK)) {
            hflags |= HF_ADDSEG_MASK;
        } else {
            hflags |= ((env->segs[R_DS].base |
                        env->segs[R_ES].base |
                        env->segs[R_SS].base) != 0) <<
                HF_ADDSEG_SHIFT;
        }
    }
    env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags;
    env->cc_src = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
    env->df = 1 - (2 * ((env->eflags >> 10) & 1));
    env->cc_op = CC_OP_EFLAGS;
    env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
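    /*
     * Roughly: hflags is qemu's cache of derived CPU state (CPL,
     * code/stack size, long mode, segment-addition shortcut) that the
     * TCG core normally keeps up to date itself; after KVM has run the
     * guest it must be recomputed from the fresh segment and control
     * registers, and eflags likewise split back into qemu's lazy
     * cc_src/cc_op/df representation.
     */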
    /* msrs */
    n = 0;
    msrs[n++].index = MSR_IA32_SYSENTER_CS;
    msrs[n++].index = MSR_IA32_SYSENTER_ESP;
    msrs[n++].index = MSR_IA32_SYSENTER_EIP;
    if (kvm_has_msr_star)
        msrs[n++].index = MSR_STAR;
    msrs[n++].index = MSR_IA32_TSC;
#ifdef TARGET_X86_64
    if (lm_capable_kernel) {
        msrs[n++].index = MSR_CSTAR;
        msrs[n++].index = MSR_KERNELGSBASE;
        msrs[n++].index = MSR_FMASK;
        msrs[n++].index = MSR_LSTAR;
    }
#endif
    rc = kvm_get_msrs(kvm_context, env->cpu_index, msrs, n);
    if (rc == -1) {
        perror("kvm_get_msrs FAILED");
    } else {
        n = rc; /* actual number of MSRs */
        for (i = 0; i < n; i++) {
            if (get_msr_entry(&msrs[i], env))
                return;
        }
    }
}
#include <signal.h>

static int try_push_interrupts(void *opaque)
{
    CPUState *env = cpu_single_env;
    int r, irq;

    if (env->ready_for_interrupt_injection &&
        (env->interrupt_request & CPU_INTERRUPT_HARD) &&
        (env->eflags & IF_MASK)) {
        env->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            r = kvm_inject_irq(kvm_context, env->cpu_index, irq);
            if (r < 0)
                printf("cpu %d fail inject %x\n", env->cpu_index, irq);
        }
    }

    return (env->interrupt_request & CPU_INTERRUPT_HARD) != 0;
}
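
/*
 * Note: the return value tells libkvm whether an interrupt is still
 * pending; a non-zero return makes it request an "interrupt window"
 * exit, so injection can be retried as soon as the guest becomes
 * interruptible again.
 */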
static void post_kvm_run(void *opaque, int vcpu)
{
    CPUState *env = vcpu_env;

    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;
    env->eflags = kvm_get_interrupt_flag(kvm_context, vcpu)
        ? env->eflags | IF_MASK : env->eflags & ~IF_MASK;
    env->ready_for_interrupt_injection
        = kvm_is_ready_for_interrupt_injection(kvm_context, vcpu);

    cpu_set_apic_tpr(env, kvm_get_cr8(kvm_context, vcpu));
    cpu_set_apic_base(env, kvm_get_apic_base(kvm_context, vcpu));
}

static int pre_kvm_run(void *opaque, int vcpu)
{
    CPUState *env = cpu_single_env;

    if (env->cpu_index == 0 && wait_hack) {
        int i;

        wait_hack = 0;

        pthread_mutex_unlock(&qemu_mutex);
        for (i = 0; i < 10; ++i)
            usleep(1000);
        pthread_mutex_lock(&qemu_mutex);
    }

    if (!kvm_irqchip_in_kernel(kvm_context))
        kvm_set_cr8(kvm_context, vcpu, cpu_get_apic_tpr(env));
    if (env->interrupt_request & CPU_INTERRUPT_EXIT)
        return 1;
    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}
void kvm_load_registers(CPUState *env)
{
    if (kvm_allowed)
        load_regs(env);
}

void kvm_save_registers(CPUState *env)
{
    if (kvm_allowed)
        save_regs(env);
}

int kvm_cpu_exec(CPUState *env)
{
    int r;

    r = kvm_run(kvm_context, env->cpu_index);
    if (r < 0) {
        printf("kvm_run returned %d\n", r);
        exit(1);
    }

    return 0;
}

extern int vm_running;

static int has_work(CPUState *env)
{
    if (!vm_running)
        return 0;
    if (!(env->hflags & HF_HALTED_MASK))
        return 1;
    if ((env->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_EXIT)) &&
        (env->eflags & IF_MASK))
        return 1;
    return 0;
}
static int kvm_eat_signal(CPUState *env, int timeout)
{
    struct timespec ts;
    int r, e, ret = 0;
    siginfo_t siginfo;
    struct sigaction sa;

    ts.tv_sec = timeout / 1000;
    ts.tv_nsec = (timeout % 1000) * 1000000;
    r = sigtimedwait(&io_sigset, &siginfo, &ts);
    if (r == -1 && (errno == EAGAIN || errno == EINTR) && !timeout)
        return 0;
    e = errno;
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = vcpu_env;
    if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
        printf("sigtimedwait: %s\n", strerror(e));
        exit(1);
    }
    if (r != -1) {
        sigaction(siginfo.si_signo, NULL, &sa);
        sa.sa_handler(siginfo.si_signo);
        ret = 1;
    }
    pthread_mutex_unlock(&qemu_mutex);

    return ret;
}
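
/*
 * Note: the io signals (SIGIO, SIGALRM, SIGUSR2) stay blocked in every
 * thread, so instead of asynchronous delivery they are picked up
 * synchronously with sigtimedwait() above and their handlers invoked
 * by hand, under qemu_mutex.  This keeps all device callbacks
 * serialized against the vcpu threads.
 */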
static void kvm_eat_signals(CPUState *env, int timeout)
{
    int r = 0;

    while (kvm_eat_signal(env, 0))
        r = 1;
    if (!r && timeout) {
        r = kvm_eat_signal(env, timeout);
        if (r)
            while (kvm_eat_signal(env, 0))
                ;
    }
    /*
     * we call select() even if no signal was received, to account for
     * events for which there is no signal handler installed.
     */
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = vcpu_env;
    main_loop_wait(0);
    pthread_mutex_unlock(&qemu_mutex);
}
static void kvm_main_loop_wait(CPUState *env, int timeout)
{
    pthread_mutex_unlock(&qemu_mutex);
    if (env->cpu_index == 0)
        kvm_eat_signals(env, timeout);
    else {
        if (!kvm_irqchip_in_kernel(kvm_context) &&
            (timeout || vcpu_info[env->cpu_index].stopped)) {
            sigset_t set;
            int n;

        paused:
            sigemptyset(&set);
            sigaddset(&set, SIG_IPI);
            sigwait(&set, &n);
        } else {
            struct timespec ts;
            siginfo_t siginfo;
            sigset_t set;

            ts.tv_sec = 0;
            ts.tv_nsec = 0;
            sigemptyset(&set);
            sigaddset(&set, SIG_IPI);
            sigtimedwait(&set, &siginfo, &ts);
        }
        if (vcpu_info[env->cpu_index].stop) {
            vcpu_info[env->cpu_index].stop = 0;
            vcpu_info[env->cpu_index].stopped = 1;
            pthread_kill(vcpu_info[0].thread, SIG_IPI);
            goto paused;
        }
    }
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;
    vcpu_info[env->cpu_index].signalled = 0;
}
static int all_threads_paused(void)
{
    int i;

    for (i = 1; i < smp_cpus; ++i)
        if (vcpu_info[i].stop)  /* pause request not yet honoured */
            return 0;
    return 1;
}
static void pause_other_threads(void)
{
    int i;

    for (i = 1; i < smp_cpus; ++i) {
        vcpu_info[i].stop = 1;
        pthread_kill(vcpu_info[i].thread, SIG_IPI);
    }
    while (!all_threads_paused())
        kvm_eat_signals(vcpu_env, 0);
}

static void resume_other_threads(void)
{
    int i;

    for (i = 1; i < smp_cpus; ++i) {
        vcpu_info[i].stop = 0;
        vcpu_info[i].stopped = 0;
        pthread_kill(vcpu_info[i].thread, SIG_IPI);
    }
}

static void kvm_vm_state_change_handler(void *context, int running)
{
    if (running)
        resume_other_threads();
    else
        pause_other_threads();
}

static void update_regs_for_sipi(CPUState *env)
{
    SegmentCache cs = env->segs[R_CS];

    save_regs(env);
    env->segs[R_CS] = cs;
    env->eip = 0;
    load_regs(env);
    vcpu_info[env->cpu_index].sipi_needed = 0;
    vcpu_info[env->cpu_index].init = 0;
}

static void update_regs_for_init(CPUState *env)
{
    cpu_reset(env);
    load_regs(env);
}
static void setup_kernel_sigmask(CPUState *env)
{
    sigset_t set;

    sigprocmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    if (env->cpu_index == 0)
        sigandset(&set, &set, &io_negsigset);

    kvm_set_signal_mask(kvm_context, env->cpu_index, &set);
}
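
/*
 * Note: kvm_set_signal_mask() hands this mask to the kernel, which
 * installs it atomically for the duration of KVM_RUN (the moral
 * equivalent of pselect()).  SIG_IPI -- and, on vcpu 0, the io signals
 * -- are therefore only deliverable while the guest is actually
 * running, closing the race between checking for work and blocking.
 */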
static int kvm_main_loop_cpu(CPUState *env)
{
    struct vcpu_info *info = &vcpu_info[env->cpu_index];

    setup_kernel_sigmask(env);
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;
    while (1) {
        while (!has_work(env))
            kvm_main_loop_wait(env, 10);
        if (env->interrupt_request & CPU_INTERRUPT_HARD)
            env->hflags &= ~HF_HALTED_MASK;
        if (!kvm_irqchip_in_kernel(kvm_context) && info->sipi_needed)
            update_regs_for_sipi(env);
        if (!kvm_irqchip_in_kernel(kvm_context) && info->init)
            update_regs_for_init(env);
        if (!(env->hflags & HF_HALTED_MASK) && !info->init)
            kvm_cpu_exec(env);
        env->interrupt_request &= ~CPU_INTERRUPT_EXIT;
        kvm_main_loop_wait(env, 0);
        if (qemu_shutdown_requested())
            break;
        else if (qemu_powerdown_requested())
            qemu_system_powerdown();
        else if (qemu_reset_requested()) {
            env->interrupt_request = 0;
            qemu_system_reset();
            load_regs(env);
        }
    }
    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}
static void *ap_main_loop(void *_env)
{
    CPUState *env = _env;
    sigset_t signals;

    vcpu_env = env;
    sigfillset(&signals);
    //sigdelset(&signals, SIG_IPI);
    sigprocmask(SIG_BLOCK, &signals, NULL);
    kvm_create_vcpu(kvm_context, env->cpu_index);
    kvm_qemu_init_env(env);
    if (kvm_irqchip_in_kernel(kvm_context))
        env->hflags &= ~HF_HALTED_MASK;
    kvm_main_loop_cpu(env);
    return NULL;
}

static void kvm_add_signal(int signum)
{
    sigaddset(&io_sigset, signum);
    sigdelset(&io_negsigset, signum);
    sigprocmask(SIG_BLOCK, &io_sigset, NULL);
}

int kvm_init_ap(void)
{
    CPUState *env = first_cpu->next_cpu;
    int i;

    qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
    sigemptyset(&io_sigset);
    sigfillset(&io_negsigset);
    kvm_add_signal(SIGIO);
    kvm_add_signal(SIGALRM);
    kvm_add_signal(SIGUSR2);
    if (!kvm_irqchip_in_kernel(kvm_context))
        kvm_add_signal(SIG_IPI);

    vcpu_env = first_cpu;
    signal(SIG_IPI, sig_ipi_handler);
    for (i = 1; i < smp_cpus; ++i) {
        pthread_create(&vcpu_info[i].thread, NULL, ap_main_loop, env);
        env = env->next_cpu;
    }
    return 0;
}

int kvm_main_loop(void)
{
    vcpu_info[0].thread = pthread_self();
    return kvm_main_loop_cpu(first_cpu);
}
static int kvm_debug(void *opaque, int vcpu)
{
    CPUState *env = cpu_single_env;

    env->exception_index = EXCP_DEBUG;
    return 1;
}

static int kvm_inb(void *opaque, uint16_t addr, uint8_t *data)
{
    *data = cpu_inb(0, addr);
    return 0;
}

static int kvm_inw(void *opaque, uint16_t addr, uint16_t *data)
{
    *data = cpu_inw(0, addr);
    return 0;
}

static int kvm_inl(void *opaque, uint16_t addr, uint32_t *data)
{
    *data = cpu_inl(0, addr);
    return 0;
}
#define PM_IO_BASE 0xb000

static int kvm_outb(void *opaque, uint16_t addr, uint8_t data)
{
    if (addr == 0xb2) {
        switch (data) {
        case 0: {
            cpu_outb(0, 0xb3, 0);
            break;
        }
        case 0xf0: {
            unsigned x;

            /* disable acpi */
            x = cpu_inw(0, PM_IO_BASE + 4);
            x &= ~1;
            cpu_outw(0, PM_IO_BASE + 4, x);
            break;
        }
        case 0xf1: {
            unsigned x;

            /* enable acpi */
            x = cpu_inw(0, PM_IO_BASE + 4);
            x |= 1;
            cpu_outw(0, PM_IO_BASE + 4, x);
            break;
        }
        default:
            break;
        }
        return 0;
    }
    cpu_outb(0, addr, data);
    return 0;
}
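
/*
 * Note: port 0xb2 is the APM/SMI command port.  Real hardware would
 * raise an SMI here; since there is no SMM emulation, writes of 0xf0
 * and 0xf1 are translated directly into clearing/setting the SCI_EN
 * bit in the PM1a control register at PM_IO_BASE + 4.
 */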
static int kvm_outw(void *opaque, uint16_t addr, uint16_t data)
{
    cpu_outw(0, addr, data);
    return 0;
}

static int kvm_outl(void *opaque, uint16_t addr, uint32_t data)
{
    cpu_outl(0, addr, data);
    return 0;
}

static int kvm_readb(void *opaque, uint64_t addr, uint8_t *data)
{
    *data = ldub_phys(addr);
    return 0;
}

static int kvm_readw(void *opaque, uint64_t addr, uint16_t *data)
{
    *data = lduw_phys(addr);
    return 0;
}

static int kvm_readl(void *opaque, uint64_t addr, uint32_t *data)
{
    /* hack: Red Hat 7.1 generates some weird accesses. */
    if (addr > 0xa0000 - 4 && addr < 0xa0000) {
        *data = 0;
        return 0;
    }

    *data = ldl_phys(addr);
    return 0;
}

static int kvm_readq(void *opaque, uint64_t addr, uint64_t *data)
{
    *data = ldq_phys(addr);
    return 0;
}

static int kvm_writeb(void *opaque, uint64_t addr, uint8_t data)
{
    stb_phys(addr, data);
    return 0;
}

static int kvm_writew(void *opaque, uint64_t addr, uint16_t data)
{
    stw_phys(addr, data);
    return 0;
}

static int kvm_writel(void *opaque, uint64_t addr, uint32_t data)
{
    stl_phys(addr, data);
    return 0;
}

static int kvm_writeq(void *opaque, uint64_t addr, uint64_t data)
{
    stq_phys(addr, data);
    return 0;
}

static int kvm_io_window(void *opaque)
{
    return 1;
}
static int kvm_halt(void *opaque, int vcpu)
{
    CPUState *env = cpu_single_env;

    if (!((env->interrupt_request & CPU_INTERRUPT_HARD) &&
          (env->eflags & IF_MASK))) {
        env->hflags |= HF_HALTED_MASK;
        env->exception_index = EXCP_HLT;
    }

    return 1;
}

static int kvm_shutdown(void *opaque, int vcpu)
{
    qemu_system_reset_request();
    return 1;
}

static struct kvm_callbacks qemu_kvm_ops = {
    .debug = kvm_debug,
    .inb   = kvm_inb,
    .inw   = kvm_inw,
    .inl   = kvm_inl,
    .outb  = kvm_outb,
    .outw  = kvm_outw,
    .outl  = kvm_outl,
    .readb = kvm_readb,
    .readw = kvm_readw,
    .readl = kvm_readl,
    .readq = kvm_readq,
    .writeb = kvm_writeb,
    .writew = kvm_writew,
    .writel = kvm_writel,
    .writeq = kvm_writeq,
    .halt  = kvm_halt,
    .shutdown = kvm_shutdown,
    .io_window = kvm_io_window,
    .try_push_interrupts = try_push_interrupts,
    .post_kvm_run = post_kvm_run,
    .pre_kvm_run = pre_kvm_run,
};
int kvm_qemu_init()
{
    /* Try to initialize kvm */
    kvm_context = kvm_init(&qemu_kvm_ops, cpu_single_env);
    if (!kvm_context) {
        return -1;
    }

    return 0;
}

int kvm_qemu_create_context(void)
{
    int i;

    if (!kvm_irqchip) {
        kvm_disable_irqchip_creation(kvm_context);
    }
    if (kvm_create(kvm_context, phys_ram_size, (void**)&phys_ram_base) < 0) {
        kvm_qemu_destroy();
        return -1;
    }
    if (kvm_shadow_memory)
        kvm_set_shadow_pages(kvm_context, kvm_shadow_memory);
    kvm_msr_list = kvm_get_msr_list(kvm_context);
    if (!kvm_msr_list) {
        kvm_qemu_destroy();
        return -1;
    }
    for (i = 0; i < kvm_msr_list->nmsrs; ++i)
        if (kvm_msr_list->indices[i] == MSR_STAR)
            kvm_has_msr_star = 1;
    return 0;
}

void kvm_qemu_destroy(void)
{
    kvm_finalize(kvm_context);
}
static void host_cpuid(uint32_t function, uint32_t *eax, uint32_t *ebx,
                       uint32_t *ecx, uint32_t *edx)
{
    uint32_t vec[4];

    vec[0] = function;
    asm volatile (
#ifdef __x86_64__
        "sub $128, %%rsp \n\t"  /* skip red zone */
        "push %0;  push %%rsi \n\t"
        "push %%rax; push %%rbx; push %%rcx; push %%rdx \n\t"
        "mov 8*5(%%rsp), %%rsi \n\t"
        "mov (%%rsi), %%eax \n\t"
        "cpuid \n\t"
        "mov %%eax, (%%rsi) \n\t"
        "mov %%ebx, 4(%%rsi) \n\t"
        "mov %%ecx, 8(%%rsi) \n\t"
        "mov %%edx, 12(%%rsi) \n\t"
        "pop %%rdx; pop %%rcx; pop %%rbx; pop %%rax \n\t"
        "pop %%rsi; pop %0 \n\t"
        "add $128, %%rsp"
#else
        "push %0;  push %%esi \n\t"
        "push %%eax; push %%ebx; push %%ecx; push %%edx \n\t"
        "mov 4*5(%%esp), %%esi \n\t"
        "mov (%%esi), %%eax \n\t"
        "cpuid \n\t"
        "mov %%eax, (%%esi) \n\t"
        "mov %%ebx, 4(%%esi) \n\t"
        "mov %%ecx, 8(%%esi) \n\t"
        "mov %%edx, 12(%%esi) \n\t"
        "pop %%edx; pop %%ecx; pop %%ebx; pop %%eax \n\t"
        "pop %%esi; pop %0 \n\t"
#endif
        : : "rm"(vec) : "memory");
    if (eax)
        *eax = vec[0];
    if (ebx)
        *ebx = vec[1];
    if (ecx)
        *ecx = vec[2];
    if (edx)
        *edx = vec[3];
}
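
/*
 * Note: the asm saves and restores every register it touches instead
 * of declaring clobbers because cpuid overwrites ebx, which the
 * compiler reserves as the PIC base register on i386 and will not
 * accept in a clobber list when compiling position-independent code.
 */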
static void do_cpuid_ent(struct kvm_cpuid_entry *e, uint32_t function,
                         CPUState *env)
{
    env->regs[R_EAX] = function;
    qemu_kvm_cpuid_on_env(env);
    e->function = function;
    e->eax = env->regs[R_EAX];
    e->ebx = env->regs[R_EBX];
    e->ecx = env->regs[R_ECX];
    e->edx = env->regs[R_EDX];
    if (function == 0x80000001) {
        uint32_t h_eax, h_edx;
        struct utsname utsname;

        host_cpuid(function, &h_eax, NULL, NULL, &h_edx);
        uname(&utsname);
        lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;

        // long mode
        if ((h_edx & 0x20000000) == 0 || !lm_capable_kernel)
            e->edx &= ~0x20000000u;
        // syscall
        if ((h_edx & 0x00000800) == 0)
            e->edx &= ~0x00000800u;
        // nx
        if ((h_edx & 0x00100000) == 0)
            e->edx &= ~0x00100000u;
        // svm
        if (e->ecx & 4)
            e->ecx &= ~4u;
    }
    // sysenter isn't supported in compatibility mode on AMD, and syscall
    // isn't supported in compatibility mode on Intel.  so advertise the
    // actual cpu, and say goodbye to migration between different vendors
    // if you use compatibility mode.
    if (function == 0) {
        uint32_t bcd[3];

        host_cpuid(0, NULL, &bcd[0], &bcd[1], &bcd[2]);
        e->ebx = bcd[0];
        e->ecx = bcd[1];
        e->edx = bcd[2];
    }
}
int kvm_qemu_init_env(CPUState *cenv)
{
    struct kvm_cpuid_entry cpuid_ent[100];
#ifdef KVM_CPUID_SIGNATURE
    struct kvm_cpuid_entry *pv_ent;
    uint32_t signature[3];
#endif
    int cpuid_nent = 0;
    CPUState copy;
    uint32_t i, limit;

    copy = *cenv;

#ifdef KVM_CPUID_SIGNATURE
    /* Paravirtualization CPUIDs */
    memcpy(signature, "KVMKVMKVM\0\0", 12);
    pv_ent = &cpuid_ent[cpuid_nent++];
    memset(pv_ent, 0, sizeof(*pv_ent));
    pv_ent->function = KVM_CPUID_SIGNATURE;
    pv_ent->eax = 0;
    pv_ent->ebx = signature[0];
    pv_ent->ecx = signature[1];
    pv_ent->edx = signature[2];

    pv_ent = &cpuid_ent[cpuid_nent++];
    memset(pv_ent, 0, sizeof(*pv_ent));
    pv_ent->function = KVM_CPUID_FEATURES;
    pv_ent->eax = 0;
#endif

    copy.regs[R_EAX] = 0;
    qemu_kvm_cpuid_on_env(&copy);
    limit = copy.regs[R_EAX];

    for (i = 0; i <= limit; ++i)
        do_cpuid_ent(&cpuid_ent[cpuid_nent++], i, &copy);

    copy.regs[R_EAX] = 0x80000000;
    qemu_kvm_cpuid_on_env(&copy);
    limit = copy.regs[R_EAX];

    for (i = 0x80000000; i <= limit; ++i)
        do_cpuid_ent(&cpuid_ent[cpuid_nent++], i, &copy);

    kvm_setup_cpuid(kvm_context, cenv->cpu_index, cpuid_nent, cpuid_ent);

    return 0;
}
int kvm_update_debugger(CPUState *env)
{
    struct kvm_debug_guest dbg;
    int i;

    dbg.enabled = 0;
    if (env->nb_breakpoints || env->singlestep_enabled) {
        dbg.enabled = 1;
        for (i = 0; i < 4 && i < env->nb_breakpoints; ++i) {
            dbg.breakpoints[i].enabled = 1;
            dbg.breakpoints[i].address = env->breakpoints[i];
        }
        dbg.singlestep = env->singlestep_enabled;
    }
    return kvm_guest_debug(kvm_context, env->cpu_index, &dbg);
}
/*
 * dirty pages logging
 */
/* FIXME: use unsigned long pointer instead of unsigned char */
unsigned char *kvm_dirty_bitmap = NULL;

int kvm_physical_memory_set_dirty_tracking(int enable)
{
    int r = 0;

    if (!kvm_allowed)
        return 0;

    if (enable) {
        if (!kvm_dirty_bitmap) {
            unsigned bitmap_size = BITMAP_SIZE(phys_ram_size);
            kvm_dirty_bitmap = qemu_malloc(bitmap_size);
            if (kvm_dirty_bitmap == NULL) {
                perror("Failed to allocate dirty pages bitmap");
                r = -1;
            } else {
                r = kvm_dirty_pages_log_enable_all(kvm_context);
            }
        }
    } else {
        if (kvm_dirty_bitmap) {
            r = kvm_dirty_pages_log_reset(kvm_context);
            qemu_free(kvm_dirty_bitmap);
            kvm_dirty_bitmap = NULL;
        }
    }
    return r;
}
/* get kvm's dirty pages bitmap and update qemu's */
int kvm_get_dirty_pages_log_slot(unsigned long start_addr,
                                 unsigned char *bitmap,
                                 unsigned int offset,
                                 unsigned int len)
{
    int r;
    unsigned int i, j, n = 0;
    unsigned char c;
    unsigned page_number, addr, addr1;

    memset(bitmap, 0, len);
    r = kvm_get_dirty_pages(kvm_context, start_addr, bitmap);
    if (r)
        return r;

    /*
     * bitmap-traveling is faster than memory-traveling (for addr...)
     * especially when most of the memory is not dirty.
     */
    for (i = 0; i < len; i++) {
        c = bitmap[i];
        while (c > 0) {
            j = ffsl(c) - 1;
            c &= ~(1u << j);
            page_number = i * 8 + j;
            addr1 = page_number * TARGET_PAGE_SIZE;
            addr = offset + addr1;
            cpu_physical_memory_set_dirty(addr);
            n++;
        }
    }
    return 0;
}
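
/*
 * Example (illustrative): if bitmap[2] == 0x05, ffsl() pulls out bits
 * 0 and 2, i.e. page numbers 16 and 18, so only the two dirty pages
 * are touched instead of scanning all 8 * len pages linearly.
 */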
/*
 * get kvm's dirty pages bitmap and update qemu's
 * we only care about physical ram, which resides in slots 0 and 3
 */
int kvm_update_dirty_pages_log(void)
{
    int r = 0, len;

    len = BITMAP_SIZE(0xa0000);
    r = kvm_get_dirty_pages_log_slot(0, kvm_dirty_bitmap, 0, len);
    len = BITMAP_SIZE(phys_ram_size - 0xc0000);
    r = r || kvm_get_dirty_pages_log_slot(0xc0000, kvm_dirty_bitmap, 0xc0000, len);
    return r;
}
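
/*
 * Note: the two calls above straddle the 0xa0000..0xc0000 hole, where
 * the VGA window and option ROMs live in their own memory slots; only
 * the RAM below 640K and the RAM from 768K up are logged here.
 */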
int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap)
{
    int r = 0, len, offset;

    len = BITMAP_SIZE(phys_ram_size);
    memset(bitmap, 0, len);

    r = kvm_get_mem_map(kvm_context, 0, bitmap);
    if (r)
        goto out;

    offset = BITMAP_SIZE(0xc0000);
    r = kvm_get_mem_map(kvm_context, 0xc0000, bitmap + offset);

out:
    return r;
}

#ifdef KVM_CAP_IRQCHIP

int kvm_set_irq(int irq, int level)
{
    return kvm_set_irq_level(kvm_context, irq, level);
}

#endif /* KVM_CAP_IRQCHIP */

#endif /* USE_KVM */