3 #include "config-host.h"
6 #define KVM_ALLOWED_DEFAULT 1
8 #define KVM_ALLOWED_DEFAULT 0
11 int kvm_allowed
= KVM_ALLOWED_DEFAULT
;
12 static int lm_capable_kernel
;
22 #include <sys/utsname.h>
24 #define MSR_IA32_TSC 0x10
26 extern void perror(const char *s
);
28 kvm_context_t kvm_context
;
29 static struct kvm_msr_list
*kvm_msr_list
;
30 static int kvm_has_msr_star
;
34 pthread_mutex_t qemu_mutex
= PTHREAD_MUTEX_INITIALIZER
;
35 static __thread CPUState
*vcpu_env
;
37 static sigset_t io_sigset
, io_negsigset
;
41 #define SIG_IPI (SIGRTMIN+4)
52 static void sig_ipi_handler(int n
)
56 void kvm_update_interrupt_request(CPUState
*env
)
58 if (env
&& env
!= vcpu_env
) {
59 if (vcpu_info
[env
->cpu_index
].signalled
)
61 vcpu_info
[env
->cpu_index
].signalled
= 1;
62 if (vcpu_info
[env
->cpu_index
].thread
)
63 pthread_kill(vcpu_info
[env
->cpu_index
].thread
, SIG_IPI
);
67 void kvm_update_after_sipi(CPUState
*env
)
69 vcpu_info
[env
->cpu_index
].sipi_needed
= 1;
70 kvm_update_interrupt_request(env
);
73 * the qemu bios waits using a busy loop that's much too short for
74 * kvm. add a wait after the first sipi.
77 static int first_sipi
= 1;
86 void kvm_apic_init(CPUState
*env
)
88 vcpu_info
[env
->cpu_index
].init
= 1;
89 kvm_update_interrupt_request(env
);
92 static void set_msr_entry(struct kvm_msr_entry
*entry
, uint32_t index
,
99 /* returns 0 on success, non-0 on failure */
100 static int get_msr_entry(struct kvm_msr_entry
*entry
, CPUState
*env
)
102 switch (entry
->index
) {
103 case MSR_IA32_SYSENTER_CS
:
104 env
->sysenter_cs
= entry
->data
;
106 case MSR_IA32_SYSENTER_ESP
:
107 env
->sysenter_esp
= entry
->data
;
109 case MSR_IA32_SYSENTER_EIP
:
110 env
->sysenter_eip
= entry
->data
;
113 env
->star
= entry
->data
;
117 env
->cstar
= entry
->data
;
119 case MSR_KERNELGSBASE
:
120 env
->kernelgsbase
= entry
->data
;
123 env
->fmask
= entry
->data
;
126 env
->lstar
= entry
->data
;
130 env
->tsc
= entry
->data
;
133 printf("Warning unknown msr index 0x%x\n", entry
->index
);
145 static void set_v8086_seg(struct kvm_segment
*lhs
, const SegmentCache
*rhs
)
147 lhs
->selector
= rhs
->selector
;
148 lhs
->base
= rhs
->base
;
149 lhs
->limit
= rhs
->limit
;
161 static void set_seg(struct kvm_segment
*lhs
, const SegmentCache
*rhs
)
163 unsigned flags
= rhs
->flags
;
164 lhs
->selector
= rhs
->selector
;
165 lhs
->base
= rhs
->base
;
166 lhs
->limit
= rhs
->limit
;
167 lhs
->type
= (flags
>> DESC_TYPE_SHIFT
) & 15;
168 lhs
->present
= (flags
& DESC_P_MASK
) != 0;
169 lhs
->dpl
= rhs
->selector
& 3;
170 lhs
->db
= (flags
>> DESC_B_SHIFT
) & 1;
171 lhs
->s
= (flags
& DESC_S_MASK
) != 0;
172 lhs
->l
= (flags
>> DESC_L_SHIFT
) & 1;
173 lhs
->g
= (flags
& DESC_G_MASK
) != 0;
174 lhs
->avl
= (flags
& DESC_AVL_MASK
) != 0;
178 static void get_seg(SegmentCache
*lhs
, const struct kvm_segment
*rhs
)
180 lhs
->selector
= rhs
->selector
;
181 lhs
->base
= rhs
->base
;
182 lhs
->limit
= rhs
->limit
;
184 (rhs
->type
<< DESC_TYPE_SHIFT
)
185 | (rhs
->present
* DESC_P_MASK
)
186 | (rhs
->dpl
<< DESC_DPL_SHIFT
)
187 | (rhs
->db
<< DESC_B_SHIFT
)
188 | (rhs
->s
* DESC_S_MASK
)
189 | (rhs
->l
<< DESC_L_SHIFT
)
190 | (rhs
->g
* DESC_G_MASK
)
191 | (rhs
->avl
* DESC_AVL_MASK
);
194 /* the reset values of qemu are not compatible with SVM;
195 * this function is used to fix the segment descriptor values */
196 static void fix_realmode_dataseg(struct kvm_segment
*seg
)
203 static void load_regs(CPUState
*env
)
205 struct kvm_regs regs
;
207 struct kvm_sregs sregs
;
208 struct kvm_msr_entry msrs
[MSR_COUNT
];
211 regs
.rax
= env
->regs
[R_EAX
];
212 regs
.rbx
= env
->regs
[R_EBX
];
213 regs
.rcx
= env
->regs
[R_ECX
];
214 regs
.rdx
= env
->regs
[R_EDX
];
215 regs
.rsi
= env
->regs
[R_ESI
];
216 regs
.rdi
= env
->regs
[R_EDI
];
217 regs
.rsp
= env
->regs
[R_ESP
];
218 regs
.rbp
= env
->regs
[R_EBP
];
220 regs
.r8
= env
->regs
[8];
221 regs
.r9
= env
->regs
[9];
222 regs
.r10
= env
->regs
[10];
223 regs
.r11
= env
->regs
[11];
224 regs
.r12
= env
->regs
[12];
225 regs
.r13
= env
->regs
[13];
226 regs
.r14
= env
->regs
[14];
227 regs
.r15
= env
->regs
[15];
230 regs
.rflags
= env
->eflags
;
233 kvm_set_regs(kvm_context
, env
->cpu_index
, &regs
);
235 memset(&fpu
, 0, sizeof fpu
);
236 fpu
.fsw
= env
->fpus
& ~(7 << 11);
237 fpu
.fsw
|= (env
->fpstt
& 7) << 11;
239 for (i
= 0; i
< 8; ++i
)
240 fpu
.ftwx
|= (!env
->fptags
[i
]) << i
;
241 memcpy(fpu
.fpr
, env
->fpregs
, sizeof env
->fpregs
);
242 memcpy(fpu
.xmm
, env
->xmm_regs
, sizeof env
->xmm_regs
);
243 fpu
.mxcsr
= env
->mxcsr
;
244 kvm_set_fpu(kvm_context
, env
->cpu_index
, &fpu
);
246 memcpy(sregs
.interrupt_bitmap
, env
->kvm_interrupt_bitmap
, sizeof(sregs
.interrupt_bitmap
));
248 if ((env
->eflags
& VM_MASK
)) {
249 set_v8086_seg(&sregs
.cs
, &env
->segs
[R_CS
]);
250 set_v8086_seg(&sregs
.ds
, &env
->segs
[R_DS
]);
251 set_v8086_seg(&sregs
.es
, &env
->segs
[R_ES
]);
252 set_v8086_seg(&sregs
.fs
, &env
->segs
[R_FS
]);
253 set_v8086_seg(&sregs
.gs
, &env
->segs
[R_GS
]);
254 set_v8086_seg(&sregs
.ss
, &env
->segs
[R_SS
]);
256 set_seg(&sregs
.cs
, &env
->segs
[R_CS
]);
257 set_seg(&sregs
.ds
, &env
->segs
[R_DS
]);
258 set_seg(&sregs
.es
, &env
->segs
[R_ES
]);
259 set_seg(&sregs
.fs
, &env
->segs
[R_FS
]);
260 set_seg(&sregs
.gs
, &env
->segs
[R_GS
]);
261 set_seg(&sregs
.ss
, &env
->segs
[R_SS
]);
263 if (env
->cr
[0] & CR0_PE_MASK
) {
264 /* force ss cpl to cs cpl */
265 sregs
.ss
.selector
= (sregs
.ss
.selector
& ~3) |
266 (sregs
.cs
.selector
& 3);
267 sregs
.ss
.dpl
= sregs
.ss
.selector
& 3;
270 if (!(env
->cr
[0] & CR0_PG_MASK
)) {
271 fix_realmode_dataseg(&sregs
.cs
);
272 fix_realmode_dataseg(&sregs
.ds
);
273 fix_realmode_dataseg(&sregs
.es
);
274 fix_realmode_dataseg(&sregs
.fs
);
275 fix_realmode_dataseg(&sregs
.gs
);
276 fix_realmode_dataseg(&sregs
.ss
);
280 set_seg(&sregs
.tr
, &env
->tr
);
281 set_seg(&sregs
.ldt
, &env
->ldt
);
283 sregs
.idt
.limit
= env
->idt
.limit
;
284 sregs
.idt
.base
= env
->idt
.base
;
285 sregs
.gdt
.limit
= env
->gdt
.limit
;
286 sregs
.gdt
.base
= env
->gdt
.base
;
288 sregs
.cr0
= env
->cr
[0];
289 sregs
.cr2
= env
->cr
[2];
290 sregs
.cr3
= env
->cr
[3];
291 sregs
.cr4
= env
->cr
[4];
293 sregs
.apic_base
= cpu_get_apic_base(env
);
294 sregs
.efer
= env
->efer
;
295 sregs
.cr8
= cpu_get_apic_tpr(env
);
297 kvm_set_sregs(kvm_context
, env
->cpu_index
, &sregs
);
301 set_msr_entry(&msrs
[n
++], MSR_IA32_SYSENTER_CS
, env
->sysenter_cs
);
302 set_msr_entry(&msrs
[n
++], MSR_IA32_SYSENTER_ESP
, env
->sysenter_esp
);
303 set_msr_entry(&msrs
[n
++], MSR_IA32_SYSENTER_EIP
, env
->sysenter_eip
);
304 if (kvm_has_msr_star
)
305 set_msr_entry(&msrs
[n
++], MSR_STAR
, env
->star
);
306 set_msr_entry(&msrs
[n
++], MSR_IA32_TSC
, env
->tsc
);
308 if (lm_capable_kernel
) {
309 set_msr_entry(&msrs
[n
++], MSR_CSTAR
, env
->cstar
);
310 set_msr_entry(&msrs
[n
++], MSR_KERNELGSBASE
, env
->kernelgsbase
);
311 set_msr_entry(&msrs
[n
++], MSR_FMASK
, env
->fmask
);
312 set_msr_entry(&msrs
[n
++], MSR_LSTAR
, env
->lstar
);
316 rc
= kvm_set_msrs(kvm_context
, env
->cpu_index
, msrs
, n
);
318 perror("kvm_set_msrs FAILED");
322 static void save_regs(CPUState
*env
)
324 struct kvm_regs regs
;
326 struct kvm_sregs sregs
;
327 struct kvm_msr_entry msrs
[MSR_COUNT
];
331 kvm_get_regs(kvm_context
, env
->cpu_index
, &regs
);
333 env
->regs
[R_EAX
] = regs
.rax
;
334 env
->regs
[R_EBX
] = regs
.rbx
;
335 env
->regs
[R_ECX
] = regs
.rcx
;
336 env
->regs
[R_EDX
] = regs
.rdx
;
337 env
->regs
[R_ESI
] = regs
.rsi
;
338 env
->regs
[R_EDI
] = regs
.rdi
;
339 env
->regs
[R_ESP
] = regs
.rsp
;
340 env
->regs
[R_EBP
] = regs
.rbp
;
342 env
->regs
[8] = regs
.r8
;
343 env
->regs
[9] = regs
.r9
;
344 env
->regs
[10] = regs
.r10
;
345 env
->regs
[11] = regs
.r11
;
346 env
->regs
[12] = regs
.r12
;
347 env
->regs
[13] = regs
.r13
;
348 env
->regs
[14] = regs
.r14
;
349 env
->regs
[15] = regs
.r15
;
352 env
->eflags
= regs
.rflags
;
355 kvm_get_fpu(kvm_context
, env
->cpu_index
, &fpu
);
356 env
->fpstt
= (fpu
.fsw
>> 11) & 7;
359 for (i
= 0; i
< 8; ++i
)
360 env
->fptags
[i
] = !((fpu
.ftwx
>> i
) & 1);
361 memcpy(env
->fpregs
, fpu
.fpr
, sizeof env
->fpregs
);
362 memcpy(env
->xmm_regs
, fpu
.xmm
, sizeof env
->xmm_regs
);
363 env
->mxcsr
= fpu
.mxcsr
;
365 kvm_get_sregs(kvm_context
, env
->cpu_index
, &sregs
);
367 memcpy(env
->kvm_interrupt_bitmap
, sregs
.interrupt_bitmap
, sizeof(env
->kvm_interrupt_bitmap
));
369 get_seg(&env
->segs
[R_CS
], &sregs
.cs
);
370 get_seg(&env
->segs
[R_DS
], &sregs
.ds
);
371 get_seg(&env
->segs
[R_ES
], &sregs
.es
);
372 get_seg(&env
->segs
[R_FS
], &sregs
.fs
);
373 get_seg(&env
->segs
[R_GS
], &sregs
.gs
);
374 get_seg(&env
->segs
[R_SS
], &sregs
.ss
);
376 get_seg(&env
->tr
, &sregs
.tr
);
377 get_seg(&env
->ldt
, &sregs
.ldt
);
379 env
->idt
.limit
= sregs
.idt
.limit
;
380 env
->idt
.base
= sregs
.idt
.base
;
381 env
->gdt
.limit
= sregs
.gdt
.limit
;
382 env
->gdt
.base
= sregs
.gdt
.base
;
384 env
->cr
[0] = sregs
.cr0
;
385 env
->cr
[2] = sregs
.cr2
;
386 env
->cr
[3] = sregs
.cr3
;
387 env
->cr
[4] = sregs
.cr4
;
389 cpu_set_apic_base(env
, sregs
.apic_base
);
391 env
->efer
= sregs
.efer
;
392 //cpu_set_apic_tpr(env, sregs.cr8);
394 #define HFLAG_COPY_MASK ~( \
395 HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
396 HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
397 HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
398 HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)
402 hflags
= (env
->segs
[R_CS
].flags
>> DESC_DPL_SHIFT
) & HF_CPL_MASK
;
403 hflags
|= (env
->cr
[0] & CR0_PE_MASK
) << (HF_PE_SHIFT
- CR0_PE_SHIFT
);
404 hflags
|= (env
->cr
[0] << (HF_MP_SHIFT
- CR0_MP_SHIFT
)) &
405 (HF_MP_MASK
| HF_EM_MASK
| HF_TS_MASK
);
406 hflags
|= (env
->eflags
& (HF_TF_MASK
| HF_VM_MASK
| HF_IOPL_MASK
));
407 hflags
|= (env
->cr
[4] & CR4_OSFXSR_MASK
) <<
408 (HF_OSFXSR_SHIFT
- CR4_OSFXSR_SHIFT
);
410 if (env
->efer
& MSR_EFER_LMA
) {
411 hflags
|= HF_LMA_MASK
;
414 if ((hflags
& HF_LMA_MASK
) && (env
->segs
[R_CS
].flags
& DESC_L_MASK
)) {
415 hflags
|= HF_CS32_MASK
| HF_SS32_MASK
| HF_CS64_MASK
;
417 hflags
|= (env
->segs
[R_CS
].flags
& DESC_B_MASK
) >>
418 (DESC_B_SHIFT
- HF_CS32_SHIFT
);
419 hflags
|= (env
->segs
[R_SS
].flags
& DESC_B_MASK
) >>
420 (DESC_B_SHIFT
- HF_SS32_SHIFT
);
421 if (!(env
->cr
[0] & CR0_PE_MASK
) ||
422 (env
->eflags
& VM_MASK
) ||
423 !(hflags
& HF_CS32_MASK
)) {
424 hflags
|= HF_ADDSEG_MASK
;
426 hflags
|= ((env
->segs
[R_DS
].base
|
427 env
->segs
[R_ES
].base
|
428 env
->segs
[R_SS
].base
) != 0) <<
432 env
->hflags
= (env
->hflags
& HFLAG_COPY_MASK
) | hflags
;
433 env
->cc_src
= env
->eflags
& (CC_O
| CC_S
| CC_Z
| CC_A
| CC_P
| CC_C
);
434 env
->df
= 1 - (2 * ((env
->eflags
>> 10) & 1));
435 env
->cc_op
= CC_OP_EFLAGS
;
436 env
->eflags
&= ~(DF_MASK
| CC_O
| CC_S
| CC_Z
| CC_A
| CC_P
| CC_C
);
440 msrs
[n
++].index
= MSR_IA32_SYSENTER_CS
;
441 msrs
[n
++].index
= MSR_IA32_SYSENTER_ESP
;
442 msrs
[n
++].index
= MSR_IA32_SYSENTER_EIP
;
443 if (kvm_has_msr_star
)
444 msrs
[n
++].index
= MSR_STAR
;
445 msrs
[n
++].index
= MSR_IA32_TSC
;
447 if (lm_capable_kernel
) {
448 msrs
[n
++].index
= MSR_CSTAR
;
449 msrs
[n
++].index
= MSR_KERNELGSBASE
;
450 msrs
[n
++].index
= MSR_FMASK
;
451 msrs
[n
++].index
= MSR_LSTAR
;
454 rc
= kvm_get_msrs(kvm_context
, env
->cpu_index
, msrs
, n
);
456 perror("kvm_get_msrs FAILED");
459 n
= rc
; /* actual number of MSRs */
460 for (i
=0 ; i
<n
; i
++) {
461 if (get_msr_entry(&msrs
[i
], env
))
470 static int try_push_interrupts(void *opaque
)
472 CPUState
*env
= cpu_single_env
;
475 if (env
->ready_for_interrupt_injection
&&
476 (env
->interrupt_request
& CPU_INTERRUPT_HARD
) &&
477 (env
->eflags
& IF_MASK
)) {
478 env
->interrupt_request
&= ~CPU_INTERRUPT_HARD
;
479 irq
= cpu_get_pic_interrupt(env
);
481 r
= kvm_inject_irq(kvm_context
, env
->cpu_index
, irq
);
483 printf("cpu %d fail inject %x\n", env
->cpu_index
, irq
);
487 return (env
->interrupt_request
& CPU_INTERRUPT_HARD
) != 0;
490 static void post_kvm_run(void *opaque
, int vcpu
)
492 CPUState
*env
= vcpu_env
;
494 pthread_mutex_lock(&qemu_mutex
);
495 cpu_single_env
= env
;
496 env
->eflags
= kvm_get_interrupt_flag(kvm_context
, vcpu
)
497 ? env
->eflags
| IF_MASK
: env
->eflags
& ~IF_MASK
;
498 env
->ready_for_interrupt_injection
499 = kvm_is_ready_for_interrupt_injection(kvm_context
, vcpu
);
501 cpu_set_apic_tpr(env
, kvm_get_cr8(kvm_context
, vcpu
));
502 cpu_set_apic_base(env
, kvm_get_apic_base(kvm_context
, vcpu
));
505 static int pre_kvm_run(void *opaque
, int vcpu
)
507 CPUState
*env
= cpu_single_env
;
509 if (env
->cpu_index
== 0 && wait_hack
) {
514 pthread_mutex_unlock(&qemu_mutex
);
515 for (i
= 0; i
< 10; ++i
)
517 pthread_mutex_lock(&qemu_mutex
);
520 kvm_set_cr8(kvm_context
, vcpu
, cpu_get_apic_tpr(env
));
521 if (env
->interrupt_request
& CPU_INTERRUPT_EXIT
)
523 pthread_mutex_unlock(&qemu_mutex
);
527 void kvm_load_registers(CPUState
*env
)
533 void kvm_save_registers(CPUState
*env
)
539 int kvm_cpu_exec(CPUState
*env
)
543 r
= kvm_run(kvm_context
, env
->cpu_index
);
545 printf("kvm_run returned %d\n", r
);
552 extern int vm_running
;
554 static int has_work(CPUState
*env
)
558 if (!(env
->hflags
& HF_HALTED_MASK
))
560 if (env
->interrupt_request
& (CPU_INTERRUPT_HARD
| CPU_INTERRUPT_EXIT
))
565 static int kvm_eat_signal(CPUState
*env
, int timeout
)
572 ts
.tv_sec
= timeout
/ 1000;
573 ts
.tv_nsec
= (timeout
% 1000) * 1000000;
574 r
= sigtimedwait(&io_sigset
, &siginfo
, &ts
);
575 if (r
== -1 && (errno
== EAGAIN
|| errno
== EINTR
) && !timeout
)
578 pthread_mutex_lock(&qemu_mutex
);
579 cpu_single_env
= vcpu_env
;
580 if (r
== -1 && !(errno
== EAGAIN
|| errno
== EINTR
)) {
581 printf("sigtimedwait: %s\n", strerror(e
));
585 sigaction(siginfo
.si_signo
, NULL
, &sa
);
586 sa
.sa_handler(siginfo
.si_signo
);
589 pthread_mutex_unlock(&qemu_mutex
);
595 static void kvm_eat_signals(CPUState
*env
, int timeout
)
599 while (kvm_eat_signal(env
, 0))
602 r
= kvm_eat_signal(env
, timeout
);
604 while (kvm_eat_signal(env
, 0))
608 * we call select() even if no signal was received, to account for
609 * file descriptors for which there is no signal handler installed.
611 pthread_mutex_lock(&qemu_mutex
);
612 cpu_single_env
= vcpu_env
;
614 pthread_mutex_unlock(&qemu_mutex
);
617 static void kvm_main_loop_wait(CPUState
*env
, int timeout
)
619 pthread_mutex_unlock(&qemu_mutex
);
620 if (env
->cpu_index
== 0)
621 kvm_eat_signals(env
, timeout
);
623 if (timeout
|| vcpu_info
[env
->cpu_index
].stopped
) {
629 sigaddset(&set
, SIG_IPI
);
639 sigaddset(&set
, SIG_IPI
);
640 sigtimedwait(&io_sigset
, &siginfo
, &ts
);
642 if (vcpu_info
[env
->cpu_index
].stop
) {
643 vcpu_info
[env
->cpu_index
].stop
= 0;
644 vcpu_info
[env
->cpu_index
].stopped
= 1;
645 pthread_kill(vcpu_info
[0].thread
, SIG_IPI
);
649 pthread_mutex_lock(&qemu_mutex
);
650 cpu_single_env
= env
;
651 vcpu_info
[env
->cpu_index
].signalled
= 0;
654 static int all_threads_paused(void)
658 for (i
= 1; i
< smp_cpus
; ++i
)
659 if (vcpu_info
[i
].stopped
)
664 static void pause_other_threads(void)
668 for (i
= 1; i
< smp_cpus
; ++i
) {
669 vcpu_info
[i
].stop
= 1;
670 pthread_kill(vcpu_info
[i
].thread
, SIG_IPI
);
672 while (!all_threads_paused())
673 kvm_eat_signals(vcpu_env
, 0);
676 static void resume_other_threads(void)
680 for (i
= 1; i
< smp_cpus
; ++i
) {
681 vcpu_info
[i
].stop
= 0;
682 vcpu_info
[i
].stopped
= 0;
683 pthread_kill(vcpu_info
[i
].thread
, SIG_IPI
);
687 static void kvm_vm_state_change_handler(void *context
, int running
)
690 resume_other_threads();
692 pause_other_threads();
695 static void update_regs_for_sipi(CPUState
*env
)
697 SegmentCache cs
= env
->segs
[R_CS
];
700 env
->segs
[R_CS
] = cs
;
703 vcpu_info
[env
->cpu_index
].sipi_needed
= 0;
704 vcpu_info
[env
->cpu_index
].init
= 0;
707 static void update_regs_for_init(CPUState
*env
)
713 static void setup_kernel_sigmask(CPUState
*env
)
717 sigprocmask(SIG_BLOCK
, NULL
, &set
);
718 sigdelset(&set
, SIG_IPI
);
719 if (env
->cpu_index
== 0)
720 sigandset(&set
, &set
, &io_negsigset
);
722 kvm_set_signal_mask(kvm_context
, env
->cpu_index
, &set
);
725 static int kvm_main_loop_cpu(CPUState
*env
)
727 struct vcpu_info
*info
= &vcpu_info
[env
->cpu_index
];
729 setup_kernel_sigmask(env
);
730 pthread_mutex_lock(&qemu_mutex
);
731 cpu_single_env
= env
;
733 while (!has_work(env
))
734 kvm_main_loop_wait(env
, 10);
735 if (env
->interrupt_request
& CPU_INTERRUPT_HARD
)
736 env
->hflags
&= ~HF_HALTED_MASK
;
737 if (info
->sipi_needed
)
738 update_regs_for_sipi(env
);
740 update_regs_for_init(env
);
741 if (!(env
->hflags
& HF_HALTED_MASK
) && !info
->init
)
743 env
->interrupt_request
&= ~CPU_INTERRUPT_EXIT
;
744 kvm_main_loop_wait(env
, 0);
745 if (qemu_shutdown_requested())
747 else if (qemu_powerdown_requested())
748 qemu_system_powerdown();
749 else if (qemu_reset_requested()) {
750 env
->interrupt_request
= 0;
755 pthread_mutex_unlock(&qemu_mutex
);
759 static void *ap_main_loop(void *_env
)
761 CPUState
*env
= _env
;
765 sigfillset(&signals
);
766 //sigdelset(&signals, SIG_IPI);
767 sigprocmask(SIG_BLOCK
, &signals
, NULL
);
768 kvm_create_vcpu(kvm_context
, env
->cpu_index
);
769 kvm_qemu_init_env(env
);
770 kvm_main_loop_cpu(env
);
774 static void kvm_add_signal(int signum
)
776 sigaddset(&io_sigset
, signum
);
777 sigdelset(&io_negsigset
, signum
);
778 sigprocmask(SIG_BLOCK
, &io_sigset
, NULL
);
781 int kvm_init_ap(void)
783 CPUState
*env
= first_cpu
->next_cpu
;
786 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler
, NULL
);
787 sigemptyset(&io_sigset
);
788 sigfillset(&io_negsigset
);
789 kvm_add_signal(SIGIO
);
790 kvm_add_signal(SIGALRM
);
791 kvm_add_signal(SIGUSR2
);
792 kvm_add_signal(SIG_IPI
);
794 vcpu_env
= first_cpu
;
795 signal(SIG_IPI
, sig_ipi_handler
);
796 for (i
= 1; i
< smp_cpus
; ++i
) {
797 pthread_create(&vcpu_info
[i
].thread
, NULL
, ap_main_loop
, env
);
803 int kvm_main_loop(void)
805 vcpu_info
[0].thread
= pthread_self();
806 return kvm_main_loop_cpu(first_cpu
);
809 static int kvm_debug(void *opaque
, int vcpu
)
811 CPUState
*env
= cpu_single_env
;
813 env
->exception_index
= EXCP_DEBUG
;
817 static int kvm_inb(void *opaque
, uint16_t addr
, uint8_t *data
)
819 *data
= cpu_inb(0, addr
);
823 static int kvm_inw(void *opaque
, uint16_t addr
, uint16_t *data
)
825 *data
= cpu_inw(0, addr
);
829 static int kvm_inl(void *opaque
, uint16_t addr
, uint32_t *data
)
831 *data
= cpu_inl(0, addr
);
835 #define PM_IO_BASE 0xb000
837 static int kvm_outb(void *opaque
, uint16_t addr
, uint8_t data
)
842 cpu_outb(0, 0xb3, 0);
849 x
= cpu_inw(0, PM_IO_BASE
+ 4);
851 cpu_outw(0, PM_IO_BASE
+ 4, x
);
858 x
= cpu_inw(0, PM_IO_BASE
+ 4);
860 cpu_outw(0, PM_IO_BASE
+ 4, x
);
868 cpu_outb(0, addr
, data
);
872 static int kvm_outw(void *opaque
, uint16_t addr
, uint16_t data
)
874 cpu_outw(0, addr
, data
);
878 static int kvm_outl(void *opaque
, uint16_t addr
, uint32_t data
)
880 cpu_outl(0, addr
, data
);
884 static int kvm_readb(void *opaque
, uint64_t addr
, uint8_t *data
)
886 *data
= ldub_phys(addr
);
890 static int kvm_readw(void *opaque
, uint64_t addr
, uint16_t *data
)
892 *data
= lduw_phys(addr
);
896 static int kvm_readl(void *opaque
, uint64_t addr
, uint32_t *data
)
898 *data
= ldl_phys(addr
);
902 static int kvm_readq(void *opaque
, uint64_t addr
, uint64_t *data
)
904 *data
= ldq_phys(addr
);
908 static int kvm_writeb(void *opaque
, uint64_t addr
, uint8_t data
)
910 stb_phys(addr
, data
);
914 static int kvm_writew(void *opaque
, uint64_t addr
, uint16_t data
)
916 stw_phys(addr
, data
);
920 static int kvm_writel(void *opaque
, uint64_t addr
, uint32_t data
)
922 stl_phys(addr
, data
);
926 static int kvm_writeq(void *opaque
, uint64_t addr
, uint64_t data
)
928 stq_phys(addr
, data
);
932 static int kvm_io_window(void *opaque
)
938 static int kvm_halt(void *opaque
, int vcpu
)
940 CPUState
*env
= cpu_single_env
;
942 if (!((env
->interrupt_request
& CPU_INTERRUPT_HARD
) &&
943 (env
->eflags
& IF_MASK
))) {
944 env
->hflags
|= HF_HALTED_MASK
;
945 env
->exception_index
= EXCP_HLT
;
951 static int kvm_shutdown(void *opaque
, int vcpu
)
953 qemu_system_reset_request();
957 static struct kvm_callbacks qemu_kvm_ops
= {
969 .writeb
= kvm_writeb
,
970 .writew
= kvm_writew
,
971 .writel
= kvm_writel
,
972 .writeq
= kvm_writeq
,
974 .shutdown
= kvm_shutdown
,
975 .io_window
= kvm_io_window
,
976 .try_push_interrupts
= try_push_interrupts
,
977 .post_kvm_run
= post_kvm_run
,
978 .pre_kvm_run
= pre_kvm_run
,
983 /* Try to initialize kvm */
984 kvm_context
= kvm_init(&qemu_kvm_ops
, cpu_single_env
);
992 int kvm_qemu_create_context(void)
996 if (kvm_create(kvm_context
, phys_ram_size
, (void**)&phys_ram_base
) < 0) {
1000 kvm_msr_list
= kvm_get_msr_list(kvm_context
);
1001 if (!kvm_msr_list
) {
1005 for (i
= 0; i
< kvm_msr_list
->nmsrs
; ++i
)
1006 if (kvm_msr_list
->indices
[i
] == MSR_STAR
)
1007 kvm_has_msr_star
= 1;
1011 void kvm_qemu_destroy(void)
1013 kvm_finalize(kvm_context
);
1016 static void host_cpuid(uint32_t function
, uint32_t *eax
, uint32_t *ebx
,
1017 uint32_t *ecx
, uint32_t *edx
)
1024 "sub $128, %%rsp \n\t" /* skip red zone */
1025 "push %0; push %%rsi \n\t"
1026 "push %%rax; push %%rbx; push %%rcx; push %%rdx \n\t"
1027 "mov 8*5(%%rsp), %%rsi \n\t"
1028 "mov (%%rsi), %%eax \n\t"
1030 "mov %%eax, (%%rsi) \n\t"
1031 "mov %%ebx, 4(%%rsi) \n\t"
1032 "mov %%ecx, 8(%%rsi) \n\t"
1033 "mov %%edx, 12(%%rsi) \n\t"
1034 "pop %%rdx; pop %%rcx; pop %%rbx; pop %%rax \n\t"
1035 "pop %%rsi; pop %0 \n\t"
1038 "push %0; push %%esi \n\t"
1039 "push %%eax; push %%ebx; push %%ecx; push %%edx \n\t"
1040 "mov 4*5(%%esp), %%esi \n\t"
1041 "mov (%%esi), %%eax \n\t"
1043 "mov %%eax, (%%esi) \n\t"
1044 "mov %%ebx, 4(%%esi) \n\t"
1045 "mov %%ecx, 8(%%esi) \n\t"
1046 "mov %%edx, 12(%%esi) \n\t"
1047 "pop %%edx; pop %%ecx; pop %%ebx; pop %%eax \n\t"
1048 "pop %%esi; pop %0 \n\t"
1050 : : "rm"(vec
) : "memory");
1061 static void do_cpuid_ent(struct kvm_cpuid_entry
*e
, uint32_t function
,
1064 env
->regs
[R_EAX
] = function
;
1065 qemu_kvm_cpuid_on_env(env
);
1066 e
->function
= function
;
1067 e
->eax
= env
->regs
[R_EAX
];
1068 e
->ebx
= env
->regs
[R_EBX
];
1069 e
->ecx
= env
->regs
[R_ECX
];
1070 e
->edx
= env
->regs
[R_EDX
];
1071 if (function
== 0x80000001) {
1072 uint32_t h_eax
, h_edx
;
1073 struct utsname utsname
;
1075 host_cpuid(function
, &h_eax
, NULL
, NULL
, &h_edx
);
1077 lm_capable_kernel
= strcmp(utsname
.machine
, "x86_64") == 0;
1080 if ((h_edx
& 0x20000000) == 0 || !lm_capable_kernel
)
1081 e
->edx
&= ~0x20000000u
;
1083 if ((h_edx
& 0x00000800) == 0)
1084 e
->edx
&= ~0x00000800u
;
1086 if ((h_edx
& 0x00100000) == 0)
1087 e
->edx
&= ~0x00100000u
;
1089 // sysenter isn't supported in compatibility mode on AMD, and syscall
1090 // isn't supported in compatibility mode on Intel. so advertise the
1091 // actual cpu, and say goodbye to migration between different vendors
1092 // if you use compatibility mode.
1093 if (function
== 0) {
1096 host_cpuid(0, NULL
, &bcd
[0], &bcd
[1], &bcd
[2]);
1103 int kvm_qemu_init_env(CPUState
*cenv
)
1105 struct kvm_cpuid_entry cpuid_ent
[100];
1112 copy
.regs
[R_EAX
] = 0;
1113 qemu_kvm_cpuid_on_env(&copy
);
1114 limit
= copy
.regs
[R_EAX
];
1116 for (i
= 0; i
<= limit
; ++i
)
1117 do_cpuid_ent(&cpuid_ent
[cpuid_nent
++], i
, &copy
);
1119 copy
.regs
[R_EAX
] = 0x80000000;
1120 qemu_kvm_cpuid_on_env(&copy
);
1121 limit
= copy
.regs
[R_EAX
];
1123 for (i
= 0x80000000; i
<= limit
; ++i
)
1124 do_cpuid_ent(&cpuid_ent
[cpuid_nent
++], i
, &copy
);
1126 kvm_setup_cpuid(kvm_context
, cenv
->cpu_index
, cpuid_nent
, cpuid_ent
);
1131 int kvm_update_debugger(CPUState
*env
)
1133 struct kvm_debug_guest dbg
;
1137 if (env
->nb_breakpoints
|| env
->singlestep_enabled
) {
1139 for (i
= 0; i
< 4 && i
< env
->nb_breakpoints
; ++i
) {
1140 dbg
.breakpoints
[i
].enabled
= 1;
1141 dbg
.breakpoints
[i
].address
= env
->breakpoints
[i
];
1143 dbg
.singlestep
= env
->singlestep_enabled
;
1145 return kvm_guest_debug(kvm_context
, env
->cpu_index
, &dbg
);
1150 * dirty pages logging
1152 /* FIXME: use unsigned long pointer instead of unsigned char */
1153 unsigned char *kvm_dirty_bitmap
= NULL
;
1154 int kvm_physical_memory_set_dirty_tracking(int enable
)
1162 if (!kvm_dirty_bitmap
) {
1163 unsigned bitmap_size
= BITMAP_SIZE(phys_ram_size
);
1164 kvm_dirty_bitmap
= qemu_malloc(bitmap_size
);
1165 if (kvm_dirty_bitmap
== NULL
) {
1166 perror("Failed to allocate dirty pages bitmap");
1170 r
= kvm_dirty_pages_log_enable_all(kvm_context
);
1175 if (kvm_dirty_bitmap
) {
1176 r
= kvm_dirty_pages_log_reset(kvm_context
);
1177 qemu_free(kvm_dirty_bitmap
);
1178 kvm_dirty_bitmap
= NULL
;
1184 /* get kvm's dirty pages bitmap and update qemu's */
1185 int kvm_get_dirty_pages_log_slot(int slot
,
1186 unsigned char *bitmap
,
1187 unsigned int offset
,
1191 unsigned int i
, j
, n
=0;
1193 unsigned page_number
, addr
, addr1
;
1195 memset(bitmap
, 0, len
);
1196 r
= kvm_get_dirty_pages(kvm_context
, slot
, bitmap
);
1201 * bitmap-traveling is faster than memory-traveling (for addr...)
1202 * especially when most of the memory is not dirty.
1204 for (i
=0; i
<len
; i
++) {
1209 page_number
= i
* 8 + j
;
1210 addr1
= page_number
* TARGET_PAGE_SIZE
;
1211 addr
= offset
+ addr1
;
1212 cpu_physical_memory_set_dirty(addr
);
1220 * get kvm's dirty pages bitmap and update qemu's
1221 * we only care about physical ram, which resides in slots 0 and 3
1223 int kvm_update_dirty_pages_log(void)
1227 len
= BITMAP_SIZE(0xa0000);
1228 r
= kvm_get_dirty_pages_log_slot(3, kvm_dirty_bitmap
, 0 , len
);
1229 len
= BITMAP_SIZE(phys_ram_size
- 0xc0000);
1230 r
= r
|| kvm_get_dirty_pages_log_slot(0, kvm_dirty_bitmap
, 0xc0000, len
);
1234 int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap
)
1236 int r
=0, len
, offset
;
1238 len
= BITMAP_SIZE(phys_ram_size
);
1239 memset(bitmap
, 0, len
);
1241 r
= kvm_get_mem_map(kvm_context
, 3, bitmap
);
1245 offset
= BITMAP_SIZE(0xc0000);
1246 r
= kvm_get_mem_map(kvm_context
, 0, bitmap
+ offset
);
1252 int kvm_set_irq(int irq
, int level
)
1254 return kvm_set_irq_level(kvm_context
, irq
, level
);