3 #include "config-host.h"
6 #define KVM_ALLOWED_DEFAULT 1
8 #define KVM_ALLOWED_DEFAULT 0
11 int kvm_allowed
= KVM_ALLOWED_DEFAULT
;
12 static int lm_capable_kernel
;
23 #include <sys/utsname.h>
25 #define MSR_IA32_TSC 0x10
27 extern void perror(const char *s
);
29 kvm_context_t kvm_context
;
30 static struct kvm_msr_list
*kvm_msr_list
;
31 static int kvm_has_msr_star
;
34 extern unsigned int kvm_shadow_memory
;
36 pthread_mutex_t qemu_mutex
= PTHREAD_MUTEX_INITIALIZER
;
37 static __thread CPUState
*vcpu_env
;
39 static sigset_t io_sigset
, io_negsigset
;
43 #define SIG_IPI (SIGRTMIN+4)
54 static void sig_ipi_handler(int n
)
58 void kvm_update_interrupt_request(CPUState
*env
)
60 if (env
&& env
!= vcpu_env
) {
61 if (vcpu_info
[env
->cpu_index
].signalled
)
63 vcpu_info
[env
->cpu_index
].signalled
= 1;
64 if (vcpu_info
[env
->cpu_index
].thread
)
65 pthread_kill(vcpu_info
[env
->cpu_index
].thread
, SIG_IPI
);
69 void kvm_update_after_sipi(CPUState
*env
)
71 vcpu_info
[env
->cpu_index
].sipi_needed
= 1;
72 kvm_update_interrupt_request(env
);
75 * the qemu bios waits using a busy loop that's much too short for
76 * kvm. add a wait after the first sipi.
79 static int first_sipi
= 1;
88 void kvm_apic_init(CPUState
*env
)
90 if (env
->cpu_index
!= 0)
91 vcpu_info
[env
->cpu_index
].init
= 1;
92 kvm_update_interrupt_request(env
);
95 static void set_msr_entry(struct kvm_msr_entry
*entry
, uint32_t index
,
102 /* returns 0 on success, non-0 on failure */
103 static int get_msr_entry(struct kvm_msr_entry
*entry
, CPUState
*env
)
105 switch (entry
->index
) {
106 case MSR_IA32_SYSENTER_CS
:
107 env
->sysenter_cs
= entry
->data
;
109 case MSR_IA32_SYSENTER_ESP
:
110 env
->sysenter_esp
= entry
->data
;
112 case MSR_IA32_SYSENTER_EIP
:
113 env
->sysenter_eip
= entry
->data
;
116 env
->star
= entry
->data
;
120 env
->cstar
= entry
->data
;
122 case MSR_KERNELGSBASE
:
123 env
->kernelgsbase
= entry
->data
;
126 env
->fmask
= entry
->data
;
129 env
->lstar
= entry
->data
;
133 env
->tsc
= entry
->data
;
136 printf("Warning unknown msr index 0x%x\n", entry
->index
);
148 static void set_v8086_seg(struct kvm_segment
*lhs
, const SegmentCache
*rhs
)
150 lhs
->selector
= rhs
->selector
;
151 lhs
->base
= rhs
->base
;
152 lhs
->limit
= rhs
->limit
;
164 static void set_seg(struct kvm_segment
*lhs
, const SegmentCache
*rhs
)
166 unsigned flags
= rhs
->flags
;
167 lhs
->selector
= rhs
->selector
;
168 lhs
->base
= rhs
->base
;
169 lhs
->limit
= rhs
->limit
;
170 lhs
->type
= (flags
>> DESC_TYPE_SHIFT
) & 15;
171 lhs
->present
= (flags
& DESC_P_MASK
) != 0;
172 lhs
->dpl
= rhs
->selector
& 3;
173 lhs
->db
= (flags
>> DESC_B_SHIFT
) & 1;
174 lhs
->s
= (flags
& DESC_S_MASK
) != 0;
175 lhs
->l
= (flags
>> DESC_L_SHIFT
) & 1;
176 lhs
->g
= (flags
& DESC_G_MASK
) != 0;
177 lhs
->avl
= (flags
& DESC_AVL_MASK
) != 0;
181 static void get_seg(SegmentCache
*lhs
, const struct kvm_segment
*rhs
)
183 lhs
->selector
= rhs
->selector
;
184 lhs
->base
= rhs
->base
;
185 lhs
->limit
= rhs
->limit
;
187 (rhs
->type
<< DESC_TYPE_SHIFT
)
188 | (rhs
->present
* DESC_P_MASK
)
189 | (rhs
->dpl
<< DESC_DPL_SHIFT
)
190 | (rhs
->db
<< DESC_B_SHIFT
)
191 | (rhs
->s
* DESC_S_MASK
)
192 | (rhs
->l
<< DESC_L_SHIFT
)
193 | (rhs
->g
* DESC_G_MASK
)
194 | (rhs
->avl
* DESC_AVL_MASK
);
197 /* the reset values of qemu are not compatible to SVM
198 * this function is used to fix the segment descriptor values */
199 static void fix_realmode_dataseg(struct kvm_segment
*seg
)
206 static void load_regs(CPUState
*env
)
208 struct kvm_regs regs
;
210 struct kvm_sregs sregs
;
211 struct kvm_msr_entry msrs
[MSR_COUNT
];
214 regs
.rax
= env
->regs
[R_EAX
];
215 regs
.rbx
= env
->regs
[R_EBX
];
216 regs
.rcx
= env
->regs
[R_ECX
];
217 regs
.rdx
= env
->regs
[R_EDX
];
218 regs
.rsi
= env
->regs
[R_ESI
];
219 regs
.rdi
= env
->regs
[R_EDI
];
220 regs
.rsp
= env
->regs
[R_ESP
];
221 regs
.rbp
= env
->regs
[R_EBP
];
223 regs
.r8
= env
->regs
[8];
224 regs
.r9
= env
->regs
[9];
225 regs
.r10
= env
->regs
[10];
226 regs
.r11
= env
->regs
[11];
227 regs
.r12
= env
->regs
[12];
228 regs
.r13
= env
->regs
[13];
229 regs
.r14
= env
->regs
[14];
230 regs
.r15
= env
->regs
[15];
233 regs
.rflags
= env
->eflags
;
236 kvm_set_regs(kvm_context
, env
->cpu_index
, ®s
);
238 memset(&fpu
, 0, sizeof fpu
);
239 fpu
.fsw
= env
->fpus
& ~(7 << 11);
240 fpu
.fsw
|= (env
->fpstt
& 7) << 11;
242 for (i
= 0; i
< 8; ++i
)
243 fpu
.ftwx
|= (!env
->fptags
[i
]) << i
;
244 memcpy(fpu
.fpr
, env
->fpregs
, sizeof env
->fpregs
);
245 memcpy(fpu
.xmm
, env
->xmm_regs
, sizeof env
->xmm_regs
);
246 fpu
.mxcsr
= env
->mxcsr
;
247 kvm_set_fpu(kvm_context
, env
->cpu_index
, &fpu
);
249 memcpy(sregs
.interrupt_bitmap
, env
->kvm_interrupt_bitmap
, sizeof(sregs
.interrupt_bitmap
));
251 if ((env
->eflags
& VM_MASK
)) {
252 set_v8086_seg(&sregs
.cs
, &env
->segs
[R_CS
]);
253 set_v8086_seg(&sregs
.ds
, &env
->segs
[R_DS
]);
254 set_v8086_seg(&sregs
.es
, &env
->segs
[R_ES
]);
255 set_v8086_seg(&sregs
.fs
, &env
->segs
[R_FS
]);
256 set_v8086_seg(&sregs
.gs
, &env
->segs
[R_GS
]);
257 set_v8086_seg(&sregs
.ss
, &env
->segs
[R_SS
]);
259 set_seg(&sregs
.cs
, &env
->segs
[R_CS
]);
260 set_seg(&sregs
.ds
, &env
->segs
[R_DS
]);
261 set_seg(&sregs
.es
, &env
->segs
[R_ES
]);
262 set_seg(&sregs
.fs
, &env
->segs
[R_FS
]);
263 set_seg(&sregs
.gs
, &env
->segs
[R_GS
]);
264 set_seg(&sregs
.ss
, &env
->segs
[R_SS
]);
266 if (env
->cr
[0] & CR0_PE_MASK
) {
267 /* force ss cpl to cs cpl */
268 sregs
.ss
.selector
= (sregs
.ss
.selector
& ~3) |
269 (sregs
.cs
.selector
& 3);
270 sregs
.ss
.dpl
= sregs
.ss
.selector
& 3;
273 if (!(env
->cr
[0] & CR0_PG_MASK
)) {
274 fix_realmode_dataseg(&sregs
.cs
);
275 fix_realmode_dataseg(&sregs
.ds
);
276 fix_realmode_dataseg(&sregs
.es
);
277 fix_realmode_dataseg(&sregs
.fs
);
278 fix_realmode_dataseg(&sregs
.gs
);
279 fix_realmode_dataseg(&sregs
.ss
);
283 set_seg(&sregs
.tr
, &env
->tr
);
284 set_seg(&sregs
.ldt
, &env
->ldt
);
286 sregs
.idt
.limit
= env
->idt
.limit
;
287 sregs
.idt
.base
= env
->idt
.base
;
288 sregs
.gdt
.limit
= env
->gdt
.limit
;
289 sregs
.gdt
.base
= env
->gdt
.base
;
291 sregs
.cr0
= env
->cr
[0];
292 sregs
.cr2
= env
->cr
[2];
293 sregs
.cr3
= env
->cr
[3];
294 sregs
.cr4
= env
->cr
[4];
296 sregs
.apic_base
= cpu_get_apic_base(env
);
297 sregs
.efer
= env
->efer
;
298 sregs
.cr8
= cpu_get_apic_tpr(env
);
300 kvm_set_sregs(kvm_context
, env
->cpu_index
, &sregs
);
304 set_msr_entry(&msrs
[n
++], MSR_IA32_SYSENTER_CS
, env
->sysenter_cs
);
305 set_msr_entry(&msrs
[n
++], MSR_IA32_SYSENTER_ESP
, env
->sysenter_esp
);
306 set_msr_entry(&msrs
[n
++], MSR_IA32_SYSENTER_EIP
, env
->sysenter_eip
);
307 if (kvm_has_msr_star
)
308 set_msr_entry(&msrs
[n
++], MSR_STAR
, env
->star
);
309 set_msr_entry(&msrs
[n
++], MSR_IA32_TSC
, env
->tsc
);
311 if (lm_capable_kernel
) {
312 set_msr_entry(&msrs
[n
++], MSR_CSTAR
, env
->cstar
);
313 set_msr_entry(&msrs
[n
++], MSR_KERNELGSBASE
, env
->kernelgsbase
);
314 set_msr_entry(&msrs
[n
++], MSR_FMASK
, env
->fmask
);
315 set_msr_entry(&msrs
[n
++], MSR_LSTAR
, env
->lstar
);
319 rc
= kvm_set_msrs(kvm_context
, env
->cpu_index
, msrs
, n
);
321 perror("kvm_set_msrs FAILED");
325 static void save_regs(CPUState
*env
)
327 struct kvm_regs regs
;
329 struct kvm_sregs sregs
;
330 struct kvm_msr_entry msrs
[MSR_COUNT
];
334 kvm_get_regs(kvm_context
, env
->cpu_index
, ®s
);
336 env
->regs
[R_EAX
] = regs
.rax
;
337 env
->regs
[R_EBX
] = regs
.rbx
;
338 env
->regs
[R_ECX
] = regs
.rcx
;
339 env
->regs
[R_EDX
] = regs
.rdx
;
340 env
->regs
[R_ESI
] = regs
.rsi
;
341 env
->regs
[R_EDI
] = regs
.rdi
;
342 env
->regs
[R_ESP
] = regs
.rsp
;
343 env
->regs
[R_EBP
] = regs
.rbp
;
345 env
->regs
[8] = regs
.r8
;
346 env
->regs
[9] = regs
.r9
;
347 env
->regs
[10] = regs
.r10
;
348 env
->regs
[11] = regs
.r11
;
349 env
->regs
[12] = regs
.r12
;
350 env
->regs
[13] = regs
.r13
;
351 env
->regs
[14] = regs
.r14
;
352 env
->regs
[15] = regs
.r15
;
355 env
->eflags
= regs
.rflags
;
358 kvm_get_fpu(kvm_context
, env
->cpu_index
, &fpu
);
359 env
->fpstt
= (fpu
.fsw
>> 11) & 7;
362 for (i
= 0; i
< 8; ++i
)
363 env
->fptags
[i
] = !((fpu
.ftwx
>> i
) & 1);
364 memcpy(env
->fpregs
, fpu
.fpr
, sizeof env
->fpregs
);
365 memcpy(env
->xmm_regs
, fpu
.xmm
, sizeof env
->xmm_regs
);
366 env
->mxcsr
= fpu
.mxcsr
;
368 kvm_get_sregs(kvm_context
, env
->cpu_index
, &sregs
);
370 memcpy(env
->kvm_interrupt_bitmap
, sregs
.interrupt_bitmap
, sizeof(env
->kvm_interrupt_bitmap
));
372 get_seg(&env
->segs
[R_CS
], &sregs
.cs
);
373 get_seg(&env
->segs
[R_DS
], &sregs
.ds
);
374 get_seg(&env
->segs
[R_ES
], &sregs
.es
);
375 get_seg(&env
->segs
[R_FS
], &sregs
.fs
);
376 get_seg(&env
->segs
[R_GS
], &sregs
.gs
);
377 get_seg(&env
->segs
[R_SS
], &sregs
.ss
);
379 get_seg(&env
->tr
, &sregs
.tr
);
380 get_seg(&env
->ldt
, &sregs
.ldt
);
382 env
->idt
.limit
= sregs
.idt
.limit
;
383 env
->idt
.base
= sregs
.idt
.base
;
384 env
->gdt
.limit
= sregs
.gdt
.limit
;
385 env
->gdt
.base
= sregs
.gdt
.base
;
387 env
->cr
[0] = sregs
.cr0
;
388 env
->cr
[2] = sregs
.cr2
;
389 env
->cr
[3] = sregs
.cr3
;
390 env
->cr
[4] = sregs
.cr4
;
392 cpu_set_apic_base(env
, sregs
.apic_base
);
394 env
->efer
= sregs
.efer
;
395 //cpu_set_apic_tpr(env, sregs.cr8);
397 #define HFLAG_COPY_MASK ~( \
398 HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
399 HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
400 HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
401 HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)
405 hflags
= (env
->segs
[R_CS
].flags
>> DESC_DPL_SHIFT
) & HF_CPL_MASK
;
406 hflags
|= (env
->cr
[0] & CR0_PE_MASK
) << (HF_PE_SHIFT
- CR0_PE_SHIFT
);
407 hflags
|= (env
->cr
[0] << (HF_MP_SHIFT
- CR0_MP_SHIFT
)) &
408 (HF_MP_MASK
| HF_EM_MASK
| HF_TS_MASK
);
409 hflags
|= (env
->eflags
& (HF_TF_MASK
| HF_VM_MASK
| HF_IOPL_MASK
));
410 hflags
|= (env
->cr
[4] & CR4_OSFXSR_MASK
) <<
411 (HF_OSFXSR_SHIFT
- CR4_OSFXSR_SHIFT
);
413 if (env
->efer
& MSR_EFER_LMA
) {
414 hflags
|= HF_LMA_MASK
;
417 if ((hflags
& HF_LMA_MASK
) && (env
->segs
[R_CS
].flags
& DESC_L_MASK
)) {
418 hflags
|= HF_CS32_MASK
| HF_SS32_MASK
| HF_CS64_MASK
;
420 hflags
|= (env
->segs
[R_CS
].flags
& DESC_B_MASK
) >>
421 (DESC_B_SHIFT
- HF_CS32_SHIFT
);
422 hflags
|= (env
->segs
[R_SS
].flags
& DESC_B_MASK
) >>
423 (DESC_B_SHIFT
- HF_SS32_SHIFT
);
424 if (!(env
->cr
[0] & CR0_PE_MASK
) ||
425 (env
->eflags
& VM_MASK
) ||
426 !(hflags
& HF_CS32_MASK
)) {
427 hflags
|= HF_ADDSEG_MASK
;
429 hflags
|= ((env
->segs
[R_DS
].base
|
430 env
->segs
[R_ES
].base
|
431 env
->segs
[R_SS
].base
) != 0) <<
435 env
->hflags
= (env
->hflags
& HFLAG_COPY_MASK
) | hflags
;
436 env
->cc_src
= env
->eflags
& (CC_O
| CC_S
| CC_Z
| CC_A
| CC_P
| CC_C
);
437 env
->df
= 1 - (2 * ((env
->eflags
>> 10) & 1));
438 env
->cc_op
= CC_OP_EFLAGS
;
439 env
->eflags
&= ~(DF_MASK
| CC_O
| CC_S
| CC_Z
| CC_A
| CC_P
| CC_C
);
443 msrs
[n
++].index
= MSR_IA32_SYSENTER_CS
;
444 msrs
[n
++].index
= MSR_IA32_SYSENTER_ESP
;
445 msrs
[n
++].index
= MSR_IA32_SYSENTER_EIP
;
446 if (kvm_has_msr_star
)
447 msrs
[n
++].index
= MSR_STAR
;
448 msrs
[n
++].index
= MSR_IA32_TSC
;
450 if (lm_capable_kernel
) {
451 msrs
[n
++].index
= MSR_CSTAR
;
452 msrs
[n
++].index
= MSR_KERNELGSBASE
;
453 msrs
[n
++].index
= MSR_FMASK
;
454 msrs
[n
++].index
= MSR_LSTAR
;
457 rc
= kvm_get_msrs(kvm_context
, env
->cpu_index
, msrs
, n
);
459 perror("kvm_get_msrs FAILED");
462 n
= rc
; /* actual number of MSRs */
463 for (i
=0 ; i
<n
; i
++) {
464 if (get_msr_entry(&msrs
[i
], env
))
473 static int try_push_interrupts(void *opaque
)
475 CPUState
*env
= cpu_single_env
;
478 if (env
->ready_for_interrupt_injection
&&
479 (env
->interrupt_request
& CPU_INTERRUPT_HARD
) &&
480 (env
->eflags
& IF_MASK
)) {
481 env
->interrupt_request
&= ~CPU_INTERRUPT_HARD
;
482 irq
= cpu_get_pic_interrupt(env
);
484 r
= kvm_inject_irq(kvm_context
, env
->cpu_index
, irq
);
486 printf("cpu %d fail inject %x\n", env
->cpu_index
, irq
);
490 return (env
->interrupt_request
& CPU_INTERRUPT_HARD
) != 0;
493 static void post_kvm_run(void *opaque
, int vcpu
)
495 CPUState
*env
= vcpu_env
;
497 pthread_mutex_lock(&qemu_mutex
);
498 cpu_single_env
= env
;
499 env
->eflags
= kvm_get_interrupt_flag(kvm_context
, vcpu
)
500 ? env
->eflags
| IF_MASK
: env
->eflags
& ~IF_MASK
;
501 env
->ready_for_interrupt_injection
502 = kvm_is_ready_for_interrupt_injection(kvm_context
, vcpu
);
504 cpu_set_apic_tpr(env
, kvm_get_cr8(kvm_context
, vcpu
));
505 cpu_set_apic_base(env
, kvm_get_apic_base(kvm_context
, vcpu
));
508 static int pre_kvm_run(void *opaque
, int vcpu
)
510 CPUState
*env
= cpu_single_env
;
512 if (env
->cpu_index
== 0 && wait_hack
) {
517 pthread_mutex_unlock(&qemu_mutex
);
518 for (i
= 0; i
< 10; ++i
)
520 pthread_mutex_lock(&qemu_mutex
);
523 if (!kvm_irqchip_in_kernel(kvm_context
))
524 kvm_set_cr8(kvm_context
, vcpu
, cpu_get_apic_tpr(env
));
525 if (env
->interrupt_request
& CPU_INTERRUPT_EXIT
)
527 pthread_mutex_unlock(&qemu_mutex
);
531 void kvm_load_registers(CPUState
*env
)
537 void kvm_save_registers(CPUState
*env
)
543 int kvm_cpu_exec(CPUState
*env
)
547 r
= kvm_run(kvm_context
, env
->cpu_index
);
549 printf("kvm_run returned %d\n", r
);
556 extern int vm_running
;
558 static int has_work(CPUState
*env
)
562 if (!(env
->hflags
& HF_HALTED_MASK
))
564 if ((env
->interrupt_request
& (CPU_INTERRUPT_HARD
| CPU_INTERRUPT_EXIT
)) &&
565 (env
->eflags
& IF_MASK
))
570 static int kvm_eat_signal(CPUState
*env
, int timeout
)
577 ts
.tv_sec
= timeout
/ 1000;
578 ts
.tv_nsec
= (timeout
% 1000) * 1000000;
579 r
= sigtimedwait(&io_sigset
, &siginfo
, &ts
);
580 if (r
== -1 && (errno
== EAGAIN
|| errno
== EINTR
) && !timeout
)
583 pthread_mutex_lock(&qemu_mutex
);
584 cpu_single_env
= vcpu_env
;
585 if (r
== -1 && !(errno
== EAGAIN
|| errno
== EINTR
)) {
586 printf("sigtimedwait: %s\n", strerror(e
));
590 sigaction(siginfo
.si_signo
, NULL
, &sa
);
591 sa
.sa_handler(siginfo
.si_signo
);
594 pthread_mutex_unlock(&qemu_mutex
);
600 static void kvm_eat_signals(CPUState
*env
, int timeout
)
604 while (kvm_eat_signal(env
, 0))
607 r
= kvm_eat_signal(env
, timeout
);
609 while (kvm_eat_signal(env
, 0))
613 * we call select() even if no signal was received, to account for
614 * for which there is no signal handler installed.
616 pthread_mutex_lock(&qemu_mutex
);
617 cpu_single_env
= vcpu_env
;
619 pthread_mutex_unlock(&qemu_mutex
);
622 static void kvm_main_loop_wait(CPUState
*env
, int timeout
)
624 pthread_mutex_unlock(&qemu_mutex
);
625 if (env
->cpu_index
== 0)
626 kvm_eat_signals(env
, timeout
);
628 if (!kvm_irqchip_in_kernel(kvm_context
) &&
629 (timeout
|| vcpu_info
[env
->cpu_index
].stopped
)) {
635 sigaddset(&set
, SIG_IPI
);
645 sigaddset(&set
, SIG_IPI
);
646 sigtimedwait(&set
, &siginfo
, &ts
);
648 if (vcpu_info
[env
->cpu_index
].stop
) {
649 vcpu_info
[env
->cpu_index
].stop
= 0;
650 vcpu_info
[env
->cpu_index
].stopped
= 1;
651 pthread_kill(vcpu_info
[0].thread
, SIG_IPI
);
655 pthread_mutex_lock(&qemu_mutex
);
656 cpu_single_env
= env
;
657 vcpu_info
[env
->cpu_index
].signalled
= 0;
660 static int all_threads_paused(void)
664 for (i
= 1; i
< smp_cpus
; ++i
)
665 if (vcpu_info
[i
].stopped
)
670 static void pause_other_threads(void)
674 for (i
= 1; i
< smp_cpus
; ++i
) {
675 vcpu_info
[i
].stop
= 1;
676 pthread_kill(vcpu_info
[i
].thread
, SIG_IPI
);
678 while (!all_threads_paused())
679 kvm_eat_signals(vcpu_env
, 0);
682 static void resume_other_threads(void)
686 for (i
= 1; i
< smp_cpus
; ++i
) {
687 vcpu_info
[i
].stop
= 0;
688 vcpu_info
[i
].stopped
= 0;
689 pthread_kill(vcpu_info
[i
].thread
, SIG_IPI
);
/*
 * VM run-state callback: resume the secondary vcpu threads when the VM
 * starts running, pause them when it stops.
 *
 * NOTE(review): in the truncated view both calls appeared unconditional,
 * which would pause immediately after resuming; the `running` conditional
 * is restored here as the only coherent reading of the callback's
 * (context, running) contract.
 */
static void kvm_vm_state_change_handler(void *context, int running)
{
    if (running)
        resume_other_threads();
    else
        pause_other_threads();
}
701 static void update_regs_for_sipi(CPUState
*env
)
703 SegmentCache cs
= env
->segs
[R_CS
];
706 env
->segs
[R_CS
] = cs
;
709 vcpu_info
[env
->cpu_index
].sipi_needed
= 0;
710 vcpu_info
[env
->cpu_index
].init
= 0;
713 static void update_regs_for_init(CPUState
*env
)
719 static void setup_kernel_sigmask(CPUState
*env
)
723 sigprocmask(SIG_BLOCK
, NULL
, &set
);
724 sigdelset(&set
, SIG_IPI
);
725 if (env
->cpu_index
== 0)
726 sigandset(&set
, &set
, &io_negsigset
);
728 kvm_set_signal_mask(kvm_context
, env
->cpu_index
, &set
);
731 static int kvm_main_loop_cpu(CPUState
*env
)
733 struct vcpu_info
*info
= &vcpu_info
[env
->cpu_index
];
735 setup_kernel_sigmask(env
);
736 pthread_mutex_lock(&qemu_mutex
);
737 cpu_single_env
= env
;
739 while (!has_work(env
))
740 kvm_main_loop_wait(env
, 10);
741 if (env
->interrupt_request
& CPU_INTERRUPT_HARD
)
742 env
->hflags
&= ~HF_HALTED_MASK
;
743 if (!kvm_irqchip_in_kernel(kvm_context
) && info
->sipi_needed
)
744 update_regs_for_sipi(env
);
745 if (!kvm_irqchip_in_kernel(kvm_context
) && info
->init
)
746 update_regs_for_init(env
);
747 if (!(env
->hflags
& HF_HALTED_MASK
) && !info
->init
)
749 env
->interrupt_request
&= ~CPU_INTERRUPT_EXIT
;
750 kvm_main_loop_wait(env
, 0);
751 if (qemu_shutdown_requested())
753 else if (qemu_powerdown_requested())
754 qemu_system_powerdown();
755 else if (qemu_reset_requested()) {
756 env
->interrupt_request
= 0;
761 pthread_mutex_unlock(&qemu_mutex
);
765 static void *ap_main_loop(void *_env
)
767 CPUState
*env
= _env
;
771 sigfillset(&signals
);
772 //sigdelset(&signals, SIG_IPI);
773 sigprocmask(SIG_BLOCK
, &signals
, NULL
);
774 kvm_create_vcpu(kvm_context
, env
->cpu_index
);
775 kvm_qemu_init_env(env
);
776 if (kvm_irqchip_in_kernel(kvm_context
))
777 env
->hflags
&= ~HF_HALTED_MASK
;
778 kvm_main_loop_cpu(env
);
782 static void kvm_add_signal(int signum
)
784 sigaddset(&io_sigset
, signum
);
785 sigdelset(&io_negsigset
, signum
);
786 sigprocmask(SIG_BLOCK
, &io_sigset
, NULL
);
789 int kvm_init_ap(void)
791 CPUState
*env
= first_cpu
->next_cpu
;
794 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler
, NULL
);
795 sigemptyset(&io_sigset
);
796 sigfillset(&io_negsigset
);
797 kvm_add_signal(SIGIO
);
798 kvm_add_signal(SIGALRM
);
799 kvm_add_signal(SIGUSR2
);
800 if (!kvm_irqchip_in_kernel(kvm_context
))
801 kvm_add_signal(SIG_IPI
);
803 vcpu_env
= first_cpu
;
804 signal(SIG_IPI
, sig_ipi_handler
);
805 for (i
= 1; i
< smp_cpus
; ++i
) {
806 pthread_create(&vcpu_info
[i
].thread
, NULL
, ap_main_loop
, env
);
812 int kvm_main_loop(void)
814 vcpu_info
[0].thread
= pthread_self();
815 return kvm_main_loop_cpu(first_cpu
);
818 static int kvm_debug(void *opaque
, int vcpu
)
820 CPUState
*env
= cpu_single_env
;
822 env
->exception_index
= EXCP_DEBUG
;
/* PIO callback: byte read from guest port `addr`.
 * Restored truncated `return 0` (success) — verify against callbacks contract. */
static int kvm_inb(void *opaque, uint16_t addr, uint8_t *data)
{
    *data = cpu_inb(0, addr);
    return 0;
}
/* PIO callback: word read from guest port `addr`.
 * Restored truncated `return 0` (success) — verify against callbacks contract. */
static int kvm_inw(void *opaque, uint16_t addr, uint16_t *data)
{
    *data = cpu_inw(0, addr);
    return 0;
}
/* PIO callback: long read from guest port `addr`.
 * Restored truncated `return 0` (success) — verify against callbacks contract. */
static int kvm_inl(void *opaque, uint16_t addr, uint32_t *data)
{
    *data = cpu_inl(0, addr);
    return 0;
}
844 #define PM_IO_BASE 0xb000
846 static int kvm_outb(void *opaque
, uint16_t addr
, uint8_t data
)
851 cpu_outb(0, 0xb3, 0);
858 x
= cpu_inw(0, PM_IO_BASE
+ 4);
860 cpu_outw(0, PM_IO_BASE
+ 4, x
);
867 x
= cpu_inw(0, PM_IO_BASE
+ 4);
869 cpu_outw(0, PM_IO_BASE
+ 4, x
);
877 cpu_outb(0, addr
, data
);
/* PIO callback: word write to guest port `addr`.
 * Restored truncated `return 0` (success) — verify against callbacks contract. */
static int kvm_outw(void *opaque, uint16_t addr, uint16_t data)
{
    cpu_outw(0, addr, data);
    return 0;
}
/* PIO callback: long write to guest port `addr`.
 * Restored truncated `return 0` (success) — verify against callbacks contract. */
static int kvm_outl(void *opaque, uint16_t addr, uint32_t data)
{
    cpu_outl(0, addr, data);
    return 0;
}
/* MMIO callback: byte load from guest physical `addr`.
 * Restored truncated `return 0` (success) — verify against callbacks contract. */
static int kvm_readb(void *opaque, uint64_t addr, uint8_t *data)
{
    *data = ldub_phys(addr);
    return 0;
}
/* MMIO callback: word load from guest physical `addr`.
 * Restored truncated `return 0` (success) — verify against callbacks contract. */
static int kvm_readw(void *opaque, uint64_t addr, uint16_t *data)
{
    *data = lduw_phys(addr);
    return 0;
}
905 static int kvm_readl(void *opaque
, uint64_t addr
, uint32_t *data
)
907 /* hack: Red Hat 7.1 generates some weird accesses. */
908 if (addr
> 0xa0000 - 4 && addr
< 0xa0000) {
913 *data
= ldl_phys(addr
);
/* MMIO callback: quad load from guest physical `addr`.
 * Restored truncated `return 0` (success) — verify against callbacks contract. */
static int kvm_readq(void *opaque, uint64_t addr, uint64_t *data)
{
    *data = ldq_phys(addr);
    return 0;
}
/* MMIO callback: byte store to guest physical `addr`.
 * Restored truncated `return 0` (success) — verify against callbacks contract. */
static int kvm_writeb(void *opaque, uint64_t addr, uint8_t data)
{
    stb_phys(addr, data);
    return 0;
}
/* MMIO callback: word store to guest physical `addr`.
 * Restored truncated `return 0` (success) — verify against callbacks contract. */
static int kvm_writew(void *opaque, uint64_t addr, uint16_t data)
{
    stw_phys(addr, data);
    return 0;
}
/* MMIO callback: long store to guest physical `addr`.
 * Restored truncated `return 0` (success) — verify against callbacks contract. */
static int kvm_writel(void *opaque, uint64_t addr, uint32_t data)
{
    stl_phys(addr, data);
    return 0;
}
/* MMIO callback: quad store to guest physical `addr`.
 * Restored truncated `return 0` (success) — verify against callbacks contract. */
static int kvm_writeq(void *opaque, uint64_t addr, uint64_t data)
{
    stq_phys(addr, data);
    return 0;
}
947 static int kvm_io_window(void *opaque
)
953 static int kvm_halt(void *opaque
, int vcpu
)
955 CPUState
*env
= cpu_single_env
;
957 if (!((env
->interrupt_request
& CPU_INTERRUPT_HARD
) &&
958 (env
->eflags
& IF_MASK
))) {
959 env
->hflags
|= HF_HALTED_MASK
;
960 env
->exception_index
= EXCP_HLT
;
/* Triple-fault/shutdown exit: request a full system reset.
 * Restored truncated `return 1` — verify against callbacks contract. */
static int kvm_shutdown(void *opaque, int vcpu)
{
    qemu_system_reset_request();
    return 1;
}
972 static struct kvm_callbacks qemu_kvm_ops
= {
984 .writeb
= kvm_writeb
,
985 .writew
= kvm_writew
,
986 .writel
= kvm_writel
,
987 .writeq
= kvm_writeq
,
989 .shutdown
= kvm_shutdown
,
990 .io_window
= kvm_io_window
,
991 .try_push_interrupts
= try_push_interrupts
,
992 .post_kvm_run
= post_kvm_run
,
993 .pre_kvm_run
= pre_kvm_run
,
998 /* Try to initialize kvm */
999 kvm_context
= kvm_init(&qemu_kvm_ops
, cpu_single_env
);
1007 int kvm_qemu_create_context(void)
1012 kvm_disable_irqchip_creation(kvm_context
);
1014 if (kvm_create(kvm_context
, phys_ram_size
, (void**)&phys_ram_base
) < 0) {
1018 if (kvm_shadow_memory
)
1019 kvm_set_shadow_pages(kvm_context
, kvm_shadow_memory
);
1020 kvm_msr_list
= kvm_get_msr_list(kvm_context
);
1021 if (!kvm_msr_list
) {
1025 for (i
= 0; i
< kvm_msr_list
->nmsrs
; ++i
)
1026 if (kvm_msr_list
->indices
[i
] == MSR_STAR
)
1027 kvm_has_msr_star
= 1;
1031 void kvm_qemu_destroy(void)
1033 kvm_finalize(kvm_context
);
1036 void kvm_cpu_register_physical_memory(target_phys_addr_t start_addr
,
1038 unsigned long phys_offset
)
1040 #ifdef KVM_CAP_USER_MEMORY
1043 r
= kvm_check_extension(kvm_context
, KVM_CAP_USER_MEMORY
);
1045 if (!(phys_offset
& ~TARGET_PAGE_MASK
)) {
1046 r
= kvm_is_allocated_mem(kvm_context
, start_addr
, size
);
1049 r
= kvm_is_intersecting_mem(kvm_context
, start_addr
);
1051 kvm_create_mem_hole(kvm_context
, start_addr
, size
);
1052 r
= kvm_register_userspace_phys_mem(kvm_context
, start_addr
,
1053 phys_ram_base
+ phys_offset
,
1056 if (phys_offset
& IO_MEM_ROM
) {
1057 phys_offset
&= ~IO_MEM_ROM
;
1058 r
= kvm_is_intersecting_mem(kvm_context
, start_addr
);
1060 kvm_create_mem_hole(kvm_context
, start_addr
, size
);
1061 r
= kvm_register_userspace_phys_mem(kvm_context
, start_addr
,
1062 phys_ram_base
+ phys_offset
,
1066 printf("kvm_cpu_register_physical_memory: failed\n");
1072 if (phys_offset
& IO_MEM_ROM
) {
1073 phys_offset
&= ~IO_MEM_ROM
;
1074 memcpy(phys_ram_base
+ start_addr
, phys_ram_base
+ phys_offset
, size
);
1078 int kvm_qemu_check_extension(int ext
)
1080 return kvm_check_extension(kvm_context
, ext
);
1083 static void host_cpuid(uint32_t function
, uint32_t *eax
, uint32_t *ebx
,
1084 uint32_t *ecx
, uint32_t *edx
)
1091 "sub $128, %%rsp \n\t" /* skip red zone */
1092 "push %0; push %%rsi \n\t"
1093 "push %%rax; push %%rbx; push %%rcx; push %%rdx \n\t"
1094 "mov 8*5(%%rsp), %%rsi \n\t"
1095 "mov (%%rsi), %%eax \n\t"
1097 "mov %%eax, (%%rsi) \n\t"
1098 "mov %%ebx, 4(%%rsi) \n\t"
1099 "mov %%ecx, 8(%%rsi) \n\t"
1100 "mov %%edx, 12(%%rsi) \n\t"
1101 "pop %%rdx; pop %%rcx; pop %%rbx; pop %%rax \n\t"
1102 "pop %%rsi; pop %0 \n\t"
1105 "push %0; push %%esi \n\t"
1106 "push %%eax; push %%ebx; push %%ecx; push %%edx \n\t"
1107 "mov 4*5(%%esp), %%esi \n\t"
1108 "mov (%%esi), %%eax \n\t"
1110 "mov %%eax, (%%esi) \n\t"
1111 "mov %%ebx, 4(%%esi) \n\t"
1112 "mov %%ecx, 8(%%esi) \n\t"
1113 "mov %%edx, 12(%%esi) \n\t"
1114 "pop %%edx; pop %%ecx; pop %%ebx; pop %%eax \n\t"
1115 "pop %%esi; pop %0 \n\t"
1117 : : "rm"(vec
) : "memory");
1128 static void do_cpuid_ent(struct kvm_cpuid_entry
*e
, uint32_t function
,
1131 env
->regs
[R_EAX
] = function
;
1132 qemu_kvm_cpuid_on_env(env
);
1133 e
->function
= function
;
1134 e
->eax
= env
->regs
[R_EAX
];
1135 e
->ebx
= env
->regs
[R_EBX
];
1136 e
->ecx
= env
->regs
[R_ECX
];
1137 e
->edx
= env
->regs
[R_EDX
];
1138 if (function
== 0x80000001) {
1139 uint32_t h_eax
, h_edx
;
1140 struct utsname utsname
;
1142 host_cpuid(function
, &h_eax
, NULL
, NULL
, &h_edx
);
1144 lm_capable_kernel
= strcmp(utsname
.machine
, "x86_64") == 0;
1147 if ((h_edx
& 0x20000000) == 0 || !lm_capable_kernel
)
1148 e
->edx
&= ~0x20000000u
;
1150 if ((h_edx
& 0x00000800) == 0)
1151 e
->edx
&= ~0x00000800u
;
1153 if ((h_edx
& 0x00100000) == 0)
1154 e
->edx
&= ~0x00100000u
;
1159 // sysenter isn't supported on compatibility mode on AMD. and syscall
1160 // isn't supported in compatibility mode on Intel. so advertise the
1161 // actual cpu, and say goodbye to migration between different vendors
1162 // if you use compatibility mode.
1163 if (function
== 0) {
1166 host_cpuid(0, NULL
, &bcd
[0], &bcd
[1], &bcd
[2]);
1173 int kvm_qemu_init_env(CPUState
*cenv
)
1175 struct kvm_cpuid_entry cpuid_ent
[100];
1176 #ifdef KVM_CPUID_SIGNATURE
1177 struct kvm_cpuid_entry
*pv_ent
;
1178 uint32_t signature
[3];
1186 #ifdef KVM_CPUID_SIGNATURE
1187 /* Paravirtualization CPUIDs */
1188 memcpy(signature
, "KVMKVMKVM", 12);
1189 pv_ent
= &cpuid_ent
[cpuid_nent
++];
1190 memset(pv_ent
, 0, sizeof(*pv_ent
));
1191 pv_ent
->function
= KVM_CPUID_SIGNATURE
;
1193 pv_ent
->ebx
= signature
[0];
1194 pv_ent
->ecx
= signature
[1];
1195 pv_ent
->edx
= signature
[2];
1197 pv_ent
= &cpuid_ent
[cpuid_nent
++];
1198 memset(pv_ent
, 0, sizeof(*pv_ent
));
1199 pv_ent
->function
= KVM_CPUID_FEATURES
;
1203 copy
.regs
[R_EAX
] = 0;
1204 qemu_kvm_cpuid_on_env(©
);
1205 limit
= copy
.regs
[R_EAX
];
1207 for (i
= 0; i
<= limit
; ++i
)
1208 do_cpuid_ent(&cpuid_ent
[cpuid_nent
++], i
, ©
);
1210 copy
.regs
[R_EAX
] = 0x80000000;
1211 qemu_kvm_cpuid_on_env(©
);
1212 limit
= copy
.regs
[R_EAX
];
1214 for (i
= 0x80000000; i
<= limit
; ++i
)
1215 do_cpuid_ent(&cpuid_ent
[cpuid_nent
++], i
, ©
);
1217 kvm_setup_cpuid(kvm_context
, cenv
->cpu_index
, cpuid_nent
, cpuid_ent
);
1222 int kvm_update_debugger(CPUState
*env
)
1224 struct kvm_debug_guest dbg
;
1228 if (env
->nb_breakpoints
|| env
->singlestep_enabled
) {
1230 for (i
= 0; i
< 4 && i
< env
->nb_breakpoints
; ++i
) {
1231 dbg
.breakpoints
[i
].enabled
= 1;
1232 dbg
.breakpoints
[i
].address
= env
->breakpoints
[i
];
1234 dbg
.singlestep
= env
->singlestep_enabled
;
1236 return kvm_guest_debug(kvm_context
, env
->cpu_index
, &dbg
);
1241 * dirty pages logging
1243 /* FIXME: use unsigned long pointer instead of unsigned char */
1244 unsigned char *kvm_dirty_bitmap
= NULL
;
1245 int kvm_physical_memory_set_dirty_tracking(int enable
)
1253 if (!kvm_dirty_bitmap
) {
1254 unsigned bitmap_size
= BITMAP_SIZE(phys_ram_size
);
1255 kvm_dirty_bitmap
= qemu_malloc(bitmap_size
);
1256 if (kvm_dirty_bitmap
== NULL
) {
1257 perror("Failed to allocate dirty pages bitmap");
1261 r
= kvm_dirty_pages_log_enable_all(kvm_context
);
1266 if (kvm_dirty_bitmap
) {
1267 r
= kvm_dirty_pages_log_reset(kvm_context
);
1268 qemu_free(kvm_dirty_bitmap
);
1269 kvm_dirty_bitmap
= NULL
;
1275 /* get kvm's dirty pages bitmap and update qemu's */
1276 int kvm_get_dirty_pages_log_slot(unsigned long start_addr
,
1277 unsigned char *bitmap
,
1278 unsigned int offset
,
1282 unsigned int i
, j
, n
=0;
1284 unsigned page_number
, addr
, addr1
;
1286 memset(bitmap
, 0, len
);
1287 r
= kvm_get_dirty_pages(kvm_context
, start_addr
, bitmap
);
1292 * bitmap-traveling is faster than memory-traveling (for addr...)
1293 * especially when most of the memory is not dirty.
1295 for (i
=0; i
<len
; i
++) {
1300 page_number
= i
* 8 + j
;
1301 addr1
= page_number
* TARGET_PAGE_SIZE
;
1302 addr
= offset
+ addr1
;
1303 cpu_physical_memory_set_dirty(addr
);
1311 * get kvm's dirty pages bitmap and update qemu's
1312 * we only care about physical ram, which resides in slots 0 and 3
1314 int kvm_update_dirty_pages_log(void)
1318 len
= BITMAP_SIZE(0xa0000);
1319 r
= kvm_get_dirty_pages_log_slot(0, kvm_dirty_bitmap
, 0 , len
);
1320 len
= BITMAP_SIZE(phys_ram_size
- 0xc0000);
1321 r
= r
|| kvm_get_dirty_pages_log_slot(0xc0000, kvm_dirty_bitmap
, 0xc0000, len
);
1325 int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap
)
1327 int r
=0, len
, offset
;
1329 len
= BITMAP_SIZE(phys_ram_size
);
1330 memset(bitmap
, 0, len
);
1332 r
= kvm_get_mem_map(kvm_context
, 0, bitmap
);
1336 offset
= BITMAP_SIZE(0xc0000);
1337 r
= kvm_get_mem_map(kvm_context
, 0xc0000, bitmap
+ offset
);
1343 #ifdef KVM_CAP_IRQCHIP
1345 int kvm_set_irq(int irq
, int level
)
1347 return kvm_set_irq_level(kvm_context
, irq
, level
);