4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
28 #include "monitor/monitor.h"
29 #include "qapi/qmp/qerror.h"
30 #include "qemu/error-report.h"
31 #include "sysemu/sysemu.h"
32 #include "exec/gdbstub.h"
33 #include "sysemu/dma.h"
34 #include "sysemu/kvm.h"
35 #include "qmp-commands.h"
37 #include "qemu/thread.h"
38 #include "sysemu/cpus.h"
39 #include "sysemu/qtest.h"
40 #include "qemu/main-loop.h"
41 #include "qemu/bitmap.h"
42 #include "qemu/seqlock.h"
43 #include "qapi-event.h"
45 #include "sysemu/replay.h"
48 #include "qemu/compatfd.h"
53 #include <sys/prctl.h>
56 #define PR_MCE_KILL 33
59 #ifndef PR_MCE_KILL_SET
60 #define PR_MCE_KILL_SET 1
63 #ifndef PR_MCE_KILL_EARLY
64 #define PR_MCE_KILL_EARLY 1
67 #endif /* CONFIG_LINUX */
69 static CPUState
*next_cpu
;
73 /* vcpu throttling controls */
74 static QEMUTimer
*throttle_timer
;
75 static unsigned int throttle_percentage
;
77 #define CPU_THROTTLE_PCT_MIN 1
78 #define CPU_THROTTLE_PCT_MAX 99
79 #define CPU_THROTTLE_TIMESLICE_NS 10000000
81 bool cpu_is_stopped(CPUState
*cpu
)
83 return cpu
->stopped
|| !runstate_is_running();
86 static bool cpu_thread_is_idle(CPUState
*cpu
)
88 if (cpu
->stop
|| cpu
->queued_work_first
) {
91 if (cpu_is_stopped(cpu
)) {
94 if (!cpu
->halted
|| cpu_has_work(cpu
) ||
95 kvm_halt_in_kernel()) {
101 static bool all_cpu_threads_idle(void)
106 if (!cpu_thread_is_idle(cpu
)) {
113 /***********************************************************/
114 /* guest cycle counter */
116 /* Protected by TimersState seqlock */
118 static bool icount_sleep
= true;
119 static int64_t vm_clock_warp_start
= -1;
120 /* Conversion factor from emulated instructions to virtual clock ticks. */
121 static int icount_time_shift
;
122 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
123 #define MAX_ICOUNT_SHIFT 10
125 static QEMUTimer
*icount_rt_timer
;
126 static QEMUTimer
*icount_vm_timer
;
127 static QEMUTimer
*icount_warp_timer
;
129 typedef struct TimersState
{
130 /* Protected by BQL. */
131 int64_t cpu_ticks_prev
;
132 int64_t cpu_ticks_offset
;
134 /* cpu_clock_offset can be read out of BQL, so protect it with
137 QemuSeqLock vm_clock_seqlock
;
138 int64_t cpu_clock_offset
;
139 int32_t cpu_ticks_enabled
;
142 /* Compensate for varying guest execution speed. */
143 int64_t qemu_icount_bias
;
144 /* Only written by TCG thread */
148 static TimersState timers_state
;
150 int64_t cpu_get_icount_raw(void)
153 CPUState
*cpu
= current_cpu
;
155 icount
= timers_state
.qemu_icount
;
157 if (!cpu
->can_do_io
) {
158 fprintf(stderr
, "Bad icount read\n");
161 icount
-= (cpu
->icount_decr
.u16
.low
+ cpu
->icount_extra
);
166 /* Return the virtual CPU time, based on the instruction counter. */
167 static int64_t cpu_get_icount_locked(void)
169 int64_t icount
= cpu_get_icount_raw();
170 return timers_state
.qemu_icount_bias
+ cpu_icount_to_ns(icount
);
173 int64_t cpu_get_icount(void)
179 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
180 icount
= cpu_get_icount_locked();
181 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
186 int64_t cpu_icount_to_ns(int64_t icount
)
188 return icount
<< icount_time_shift
;
191 /* return the host CPU cycle counter and handle stop/restart */
192 /* Caller must hold the BQL */
193 int64_t cpu_get_ticks(void)
198 return cpu_get_icount();
201 ticks
= timers_state
.cpu_ticks_offset
;
202 if (timers_state
.cpu_ticks_enabled
) {
203 ticks
+= cpu_get_host_ticks();
206 if (timers_state
.cpu_ticks_prev
> ticks
) {
207 /* Note: non increasing ticks may happen if the host uses
209 timers_state
.cpu_ticks_offset
+= timers_state
.cpu_ticks_prev
- ticks
;
210 ticks
= timers_state
.cpu_ticks_prev
;
213 timers_state
.cpu_ticks_prev
= ticks
;
217 static int64_t cpu_get_clock_locked(void)
221 ticks
= timers_state
.cpu_clock_offset
;
222 if (timers_state
.cpu_ticks_enabled
) {
223 ticks
+= get_clock();
229 /* return the host CPU monotonic timer and handle stop/restart */
230 int64_t cpu_get_clock(void)
236 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
237 ti
= cpu_get_clock_locked();
238 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
243 /* enable cpu_get_ticks()
244 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
246 void cpu_enable_ticks(void)
248 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
249 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
250 if (!timers_state
.cpu_ticks_enabled
) {
251 timers_state
.cpu_ticks_offset
-= cpu_get_host_ticks();
252 timers_state
.cpu_clock_offset
-= get_clock();
253 timers_state
.cpu_ticks_enabled
= 1;
255 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
258 /* disable cpu_get_ticks() : the clock is stopped. You must not call
259 * cpu_get_ticks() after that.
260 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
262 void cpu_disable_ticks(void)
264 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
265 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
266 if (timers_state
.cpu_ticks_enabled
) {
267 timers_state
.cpu_ticks_offset
+= cpu_get_host_ticks();
268 timers_state
.cpu_clock_offset
= cpu_get_clock_locked();
269 timers_state
.cpu_ticks_enabled
= 0;
271 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
274 /* Correlation between real and virtual time is always going to be
275 fairly approximate, so ignore small variation.
276 When the guest is idle real and virtual time will be aligned in
278 #define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)
280 static void icount_adjust(void)
286 /* Protected by TimersState mutex. */
287 static int64_t last_delta
;
289 /* If the VM is not running, then do nothing. */
290 if (!runstate_is_running()) {
294 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
295 cur_time
= cpu_get_clock_locked();
296 cur_icount
= cpu_get_icount_locked();
298 delta
= cur_icount
- cur_time
;
299 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
301 && last_delta
+ ICOUNT_WOBBLE
< delta
* 2
302 && icount_time_shift
> 0) {
303 /* The guest is getting too far ahead. Slow time down. */
307 && last_delta
- ICOUNT_WOBBLE
> delta
* 2
308 && icount_time_shift
< MAX_ICOUNT_SHIFT
) {
309 /* The guest is getting too far behind. Speed time up. */
313 timers_state
.qemu_icount_bias
= cur_icount
314 - (timers_state
.qemu_icount
<< icount_time_shift
);
315 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
318 static void icount_adjust_rt(void *opaque
)
320 timer_mod(icount_rt_timer
,
321 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
325 static void icount_adjust_vm(void *opaque
)
327 timer_mod(icount_vm_timer
,
328 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
329 get_ticks_per_sec() / 10);
333 static int64_t qemu_icount_round(int64_t count
)
335 return (count
+ (1 << icount_time_shift
) - 1) >> icount_time_shift
;
338 static void icount_warp_rt(void)
340 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
341 * changes from -1 to another value, so the race here is okay.
343 if (atomic_read(&vm_clock_warp_start
) == -1) {
347 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
348 if (runstate_is_running()) {
349 int64_t clock
= REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT
,
350 cpu_get_clock_locked());
353 warp_delta
= clock
- vm_clock_warp_start
;
354 if (use_icount
== 2) {
356 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
357 * far ahead of real time.
359 int64_t cur_icount
= cpu_get_icount_locked();
360 int64_t delta
= clock
- cur_icount
;
361 warp_delta
= MIN(warp_delta
, delta
);
363 timers_state
.qemu_icount_bias
+= warp_delta
;
365 vm_clock_warp_start
= -1;
366 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
368 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL
)) {
369 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
373 static void icount_timer_cb(void *opaque
)
375 /* No need for a checkpoint because the timer already synchronizes
376 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
381 void qtest_clock_warp(int64_t dest
)
383 int64_t clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
384 AioContext
*aio_context
;
385 assert(qtest_enabled());
386 aio_context
= qemu_get_aio_context();
387 while (clock
< dest
) {
388 int64_t deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
389 int64_t warp
= qemu_soonest_timeout(dest
- clock
, deadline
);
391 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
392 timers_state
.qemu_icount_bias
+= warp
;
393 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
395 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL
);
396 timerlist_run_timers(aio_context
->tlg
.tl
[QEMU_CLOCK_VIRTUAL
]);
397 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
399 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
402 void qemu_start_warp_timer(void)
411 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
412 * do not fire, so computing the deadline does not make sense.
414 if (!runstate_is_running()) {
418 /* warp clock deterministically in record/replay mode */
419 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START
)) {
423 if (!all_cpu_threads_idle()) {
427 if (qtest_enabled()) {
428 /* When testing, qtest commands advance icount. */
432 /* We want to use the earliest deadline from ALL vm_clocks */
433 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
);
434 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
436 static bool notified
;
437 if (!icount_sleep
&& !notified
) {
438 error_report("WARNING: icount sleep disabled and no active timers");
446 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
447 * sleep. Otherwise, the CPU might be waiting for a future timer
448 * interrupt to wake it up, but the interrupt never comes because
449 * the vCPU isn't running any insns and thus doesn't advance the
450 * QEMU_CLOCK_VIRTUAL.
454 * We never let VCPUs sleep in no sleep icount mode.
455 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
456 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
457 * It is useful when we want a deterministic execution time,
458 * isolated from host latencies.
460 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
461 timers_state
.qemu_icount_bias
+= deadline
;
462 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
463 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
466 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
467 * "real" time, (related to the time left until the next event) has
468 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
469 * This avoids that the warps are visible externally; for example,
470 * you will not be sending network packets continuously instead of
473 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
474 if (vm_clock_warp_start
== -1 || vm_clock_warp_start
> clock
) {
475 vm_clock_warp_start
= clock
;
477 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
478 timer_mod_anticipate(icount_warp_timer
, clock
+ deadline
);
480 } else if (deadline
== 0) {
481 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
485 static void qemu_account_warp_timer(void)
487 if (!use_icount
|| !icount_sleep
) {
491 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
492 * do not fire, so computing the deadline does not make sense.
494 if (!runstate_is_running()) {
498 /* warp clock deterministically in record/replay mode */
499 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT
)) {
503 timer_del(icount_warp_timer
);
507 static bool icount_state_needed(void *opaque
)
513 * This is a subsection for icount migration.
515 static const VMStateDescription icount_vmstate_timers
= {
516 .name
= "timer/icount",
518 .minimum_version_id
= 1,
519 .needed
= icount_state_needed
,
520 .fields
= (VMStateField
[]) {
521 VMSTATE_INT64(qemu_icount_bias
, TimersState
),
522 VMSTATE_INT64(qemu_icount
, TimersState
),
523 VMSTATE_END_OF_LIST()
527 static const VMStateDescription vmstate_timers
= {
530 .minimum_version_id
= 1,
531 .fields
= (VMStateField
[]) {
532 VMSTATE_INT64(cpu_ticks_offset
, TimersState
),
533 VMSTATE_INT64(dummy
, TimersState
),
534 VMSTATE_INT64_V(cpu_clock_offset
, TimersState
, 2),
535 VMSTATE_END_OF_LIST()
537 .subsections
= (const VMStateDescription
*[]) {
538 &icount_vmstate_timers
,
543 static void cpu_throttle_thread(void *opaque
)
545 CPUState
*cpu
= opaque
;
547 double throttle_ratio
;
550 if (!cpu_throttle_get_percentage()) {
554 pct
= (double)cpu_throttle_get_percentage()/100;
555 throttle_ratio
= pct
/ (1 - pct
);
556 sleeptime_ns
= (long)(throttle_ratio
* CPU_THROTTLE_TIMESLICE_NS
);
558 qemu_mutex_unlock_iothread();
559 atomic_set(&cpu
->throttle_thread_scheduled
, 0);
560 g_usleep(sleeptime_ns
/ 1000); /* Convert ns to us for usleep call */
561 qemu_mutex_lock_iothread();
564 static void cpu_throttle_timer_tick(void *opaque
)
569 /* Stop the timer if needed */
570 if (!cpu_throttle_get_percentage()) {
574 if (!atomic_xchg(&cpu
->throttle_thread_scheduled
, 1)) {
575 async_run_on_cpu(cpu
, cpu_throttle_thread
, cpu
);
579 pct
= (double)cpu_throttle_get_percentage()/100;
580 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
581 CPU_THROTTLE_TIMESLICE_NS
/ (1-pct
));
584 void cpu_throttle_set(int new_throttle_pct
)
586 /* Ensure throttle percentage is within valid range */
587 new_throttle_pct
= MIN(new_throttle_pct
, CPU_THROTTLE_PCT_MAX
);
588 new_throttle_pct
= MAX(new_throttle_pct
, CPU_THROTTLE_PCT_MIN
);
590 atomic_set(&throttle_percentage
, new_throttle_pct
);
592 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
593 CPU_THROTTLE_TIMESLICE_NS
);
596 void cpu_throttle_stop(void)
598 atomic_set(&throttle_percentage
, 0);
/* Return true when the current throttle percentage is non-zero. */
bool cpu_throttle_active(void)
{
    int current_pct = cpu_throttle_get_percentage();

    return current_pct != 0;
}
606 int cpu_throttle_get_percentage(void)
608 return atomic_read(&throttle_percentage
);
611 void cpu_ticks_init(void)
613 seqlock_init(&timers_state
.vm_clock_seqlock
, NULL
);
614 vmstate_register(NULL
, 0, &vmstate_timers
, &timers_state
);
615 throttle_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
616 cpu_throttle_timer_tick
, NULL
);
619 void configure_icount(QemuOpts
*opts
, Error
**errp
)
622 char *rem_str
= NULL
;
624 option
= qemu_opt_get(opts
, "shift");
626 if (qemu_opt_get(opts
, "align") != NULL
) {
627 error_setg(errp
, "Please specify shift option when using align");
632 icount_sleep
= qemu_opt_get_bool(opts
, "sleep", true);
634 icount_warp_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
635 icount_timer_cb
, NULL
);
638 icount_align_option
= qemu_opt_get_bool(opts
, "align", false);
640 if (icount_align_option
&& !icount_sleep
) {
641 error_setg(errp
, "align=on and sleep=off are incompatible");
643 if (strcmp(option
, "auto") != 0) {
645 icount_time_shift
= strtol(option
, &rem_str
, 0);
646 if (errno
!= 0 || *rem_str
!= '\0' || !strlen(option
)) {
647 error_setg(errp
, "icount: Invalid shift value");
651 } else if (icount_align_option
) {
652 error_setg(errp
, "shift=auto and align=on are incompatible");
653 } else if (!icount_sleep
) {
654 error_setg(errp
, "shift=auto and sleep=off are incompatible");
659 /* 125MIPS seems a reasonable initial guess at the guest speed.
660 It will be corrected fairly quickly anyway. */
661 icount_time_shift
= 3;
663 /* Have both realtime and virtual time triggers for speed adjustment.
664 The realtime trigger catches emulated time passing too slowly,
665 the virtual time trigger catches emulated time passing too fast.
666 Realtime triggers occur even when idle, so use them less frequently
668 icount_rt_timer
= timer_new_ms(QEMU_CLOCK_VIRTUAL_RT
,
669 icount_adjust_rt
, NULL
);
670 timer_mod(icount_rt_timer
,
671 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
672 icount_vm_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL
,
673 icount_adjust_vm
, NULL
);
674 timer_mod(icount_vm_timer
,
675 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
676 get_ticks_per_sec() / 10);
679 /***********************************************************/
680 void hw_error(const char *fmt
, ...)
686 fprintf(stderr
, "qemu: hardware error: ");
687 vfprintf(stderr
, fmt
, ap
);
688 fprintf(stderr
, "\n");
690 fprintf(stderr
, "CPU #%d:\n", cpu
->cpu_index
);
691 cpu_dump_state(cpu
, stderr
, fprintf
, CPU_DUMP_FPU
);
697 void cpu_synchronize_all_states(void)
702 cpu_synchronize_state(cpu
);
706 void cpu_synchronize_all_post_reset(void)
711 cpu_synchronize_post_reset(cpu
);
715 void cpu_synchronize_all_post_init(void)
720 cpu_synchronize_post_init(cpu
);
724 static int do_vm_stop(RunState state
)
728 if (runstate_is_running()) {
732 vm_state_notify(0, state
);
733 qapi_event_send_stop(&error_abort
);
737 ret
= bdrv_flush_all();
742 static bool cpu_can_run(CPUState
*cpu
)
747 if (cpu_is_stopped(cpu
)) {
753 static void cpu_handle_guest_debug(CPUState
*cpu
)
755 gdb_set_stop_cpu(cpu
);
756 qemu_system_debug_request();
761 static void sigbus_reraise(void)
764 struct sigaction action
;
766 memset(&action
, 0, sizeof(action
));
767 action
.sa_handler
= SIG_DFL
;
768 if (!sigaction(SIGBUS
, &action
, NULL
)) {
771 sigaddset(&set
, SIGBUS
);
772 sigprocmask(SIG_UNBLOCK
, &set
, NULL
);
774 perror("Failed to re-raise SIGBUS!\n");
778 static void sigbus_handler(int n
, struct qemu_signalfd_siginfo
*siginfo
,
781 if (kvm_on_sigbus(siginfo
->ssi_code
,
782 (void *)(intptr_t)siginfo
->ssi_addr
)) {
787 static void qemu_init_sigbus(void)
789 struct sigaction action
;
791 memset(&action
, 0, sizeof(action
));
792 action
.sa_flags
= SA_SIGINFO
;
793 action
.sa_sigaction
= (void (*)(int, siginfo_t
*, void*))sigbus_handler
;
794 sigaction(SIGBUS
, &action
, NULL
);
796 prctl(PR_MCE_KILL
, PR_MCE_KILL_SET
, PR_MCE_KILL_EARLY
, 0, 0);
799 static void qemu_kvm_eat_signals(CPUState
*cpu
)
801 struct timespec ts
= { 0, 0 };
807 sigemptyset(&waitset
);
808 sigaddset(&waitset
, SIG_IPI
);
809 sigaddset(&waitset
, SIGBUS
);
812 r
= sigtimedwait(&waitset
, &siginfo
, &ts
);
813 if (r
== -1 && !(errno
== EAGAIN
|| errno
== EINTR
)) {
814 perror("sigtimedwait");
820 if (kvm_on_sigbus_vcpu(cpu
, siginfo
.si_code
, siginfo
.si_addr
)) {
828 r
= sigpending(&chkset
);
830 perror("sigpending");
833 } while (sigismember(&chkset
, SIG_IPI
) || sigismember(&chkset
, SIGBUS
));
836 #else /* !CONFIG_LINUX */
838 static void qemu_init_sigbus(void)
842 static void qemu_kvm_eat_signals(CPUState
*cpu
)
845 #endif /* !CONFIG_LINUX */
848 static void dummy_signal(int sig
)
852 static void qemu_kvm_init_cpu_signals(CPUState
*cpu
)
856 struct sigaction sigact
;
858 memset(&sigact
, 0, sizeof(sigact
));
859 sigact
.sa_handler
= dummy_signal
;
860 sigaction(SIG_IPI
, &sigact
, NULL
);
862 pthread_sigmask(SIG_BLOCK
, NULL
, &set
);
863 sigdelset(&set
, SIG_IPI
);
864 sigdelset(&set
, SIGBUS
);
865 r
= kvm_set_signal_mask(cpu
, &set
);
867 fprintf(stderr
, "kvm_set_signal_mask: %s\n", strerror(-r
));
873 static void qemu_kvm_init_cpu_signals(CPUState
*cpu
)
879 static QemuMutex qemu_global_mutex
;
880 static QemuCond qemu_io_proceeded_cond
;
881 static unsigned iothread_requesting_mutex
;
883 static QemuThread io_thread
;
886 static QemuCond qemu_cpu_cond
;
888 static QemuCond qemu_pause_cond
;
889 static QemuCond qemu_work_cond
;
891 void qemu_init_cpu_loop(void)
894 qemu_cond_init(&qemu_cpu_cond
);
895 qemu_cond_init(&qemu_pause_cond
);
896 qemu_cond_init(&qemu_work_cond
);
897 qemu_cond_init(&qemu_io_proceeded_cond
);
898 qemu_mutex_init(&qemu_global_mutex
);
900 qemu_thread_get_self(&io_thread
);
903 void run_on_cpu(CPUState
*cpu
, void (*func
)(void *data
), void *data
)
905 struct qemu_work_item wi
;
907 if (qemu_cpu_is_self(cpu
)) {
916 qemu_mutex_lock(&cpu
->work_mutex
);
917 if (cpu
->queued_work_first
== NULL
) {
918 cpu
->queued_work_first
= &wi
;
920 cpu
->queued_work_last
->next
= &wi
;
922 cpu
->queued_work_last
= &wi
;
925 qemu_mutex_unlock(&cpu
->work_mutex
);
928 while (!atomic_mb_read(&wi
.done
)) {
929 CPUState
*self_cpu
= current_cpu
;
931 qemu_cond_wait(&qemu_work_cond
, &qemu_global_mutex
);
932 current_cpu
= self_cpu
;
936 void async_run_on_cpu(CPUState
*cpu
, void (*func
)(void *data
), void *data
)
938 struct qemu_work_item
*wi
;
940 if (qemu_cpu_is_self(cpu
)) {
945 wi
= g_malloc0(sizeof(struct qemu_work_item
));
950 qemu_mutex_lock(&cpu
->work_mutex
);
951 if (cpu
->queued_work_first
== NULL
) {
952 cpu
->queued_work_first
= wi
;
954 cpu
->queued_work_last
->next
= wi
;
956 cpu
->queued_work_last
= wi
;
959 qemu_mutex_unlock(&cpu
->work_mutex
);
964 static void flush_queued_work(CPUState
*cpu
)
966 struct qemu_work_item
*wi
;
968 if (cpu
->queued_work_first
== NULL
) {
972 qemu_mutex_lock(&cpu
->work_mutex
);
973 while (cpu
->queued_work_first
!= NULL
) {
974 wi
= cpu
->queued_work_first
;
975 cpu
->queued_work_first
= wi
->next
;
976 if (!cpu
->queued_work_first
) {
977 cpu
->queued_work_last
= NULL
;
979 qemu_mutex_unlock(&cpu
->work_mutex
);
981 qemu_mutex_lock(&cpu
->work_mutex
);
985 atomic_mb_set(&wi
->done
, true);
988 qemu_mutex_unlock(&cpu
->work_mutex
);
989 qemu_cond_broadcast(&qemu_work_cond
);
992 static void qemu_wait_io_event_common(CPUState
*cpu
)
997 qemu_cond_broadcast(&qemu_pause_cond
);
999 flush_queued_work(cpu
);
1000 cpu
->thread_kicked
= false;
1003 static void qemu_tcg_wait_io_event(CPUState
*cpu
)
1005 while (all_cpu_threads_idle()) {
1006 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1009 while (iothread_requesting_mutex
) {
1010 qemu_cond_wait(&qemu_io_proceeded_cond
, &qemu_global_mutex
);
1014 qemu_wait_io_event_common(cpu
);
1018 static void qemu_kvm_wait_io_event(CPUState
*cpu
)
1020 while (cpu_thread_is_idle(cpu
)) {
1021 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1024 qemu_kvm_eat_signals(cpu
);
1025 qemu_wait_io_event_common(cpu
);
1028 static void *qemu_kvm_cpu_thread_fn(void *arg
)
1030 CPUState
*cpu
= arg
;
1033 rcu_register_thread();
1035 qemu_mutex_lock_iothread();
1036 qemu_thread_get_self(cpu
->thread
);
1037 cpu
->thread_id
= qemu_get_thread_id();
1041 r
= kvm_init_vcpu(cpu
);
1043 fprintf(stderr
, "kvm_init_vcpu failed: %s\n", strerror(-r
));
1047 qemu_kvm_init_cpu_signals(cpu
);
1049 /* signal CPU creation */
1050 cpu
->created
= true;
1051 qemu_cond_signal(&qemu_cpu_cond
);
1054 if (cpu_can_run(cpu
)) {
1055 r
= kvm_cpu_exec(cpu
);
1056 if (r
== EXCP_DEBUG
) {
1057 cpu_handle_guest_debug(cpu
);
1060 qemu_kvm_wait_io_event(cpu
);
1066 static void *qemu_dummy_cpu_thread_fn(void *arg
)
1069 fprintf(stderr
, "qtest is not supported under Windows\n");
1072 CPUState
*cpu
= arg
;
1076 rcu_register_thread();
1078 qemu_mutex_lock_iothread();
1079 qemu_thread_get_self(cpu
->thread
);
1080 cpu
->thread_id
= qemu_get_thread_id();
1083 sigemptyset(&waitset
);
1084 sigaddset(&waitset
, SIG_IPI
);
1086 /* signal CPU creation */
1087 cpu
->created
= true;
1088 qemu_cond_signal(&qemu_cpu_cond
);
1093 qemu_mutex_unlock_iothread();
1096 r
= sigwait(&waitset
, &sig
);
1097 } while (r
== -1 && (errno
== EAGAIN
|| errno
== EINTR
));
1102 qemu_mutex_lock_iothread();
1104 qemu_wait_io_event_common(cpu
);
1111 static void tcg_exec_all(void);
1113 static void *qemu_tcg_cpu_thread_fn(void *arg
)
1115 CPUState
*cpu
= arg
;
1117 rcu_register_thread();
1119 qemu_mutex_lock_iothread();
1120 qemu_thread_get_self(cpu
->thread
);
1123 cpu
->thread_id
= qemu_get_thread_id();
1124 cpu
->created
= true;
1127 qemu_cond_signal(&qemu_cpu_cond
);
1129 /* wait for initial kick-off after machine start */
1130 while (first_cpu
->stopped
) {
1131 qemu_cond_wait(first_cpu
->halt_cond
, &qemu_global_mutex
);
1133 /* process any pending work */
1135 qemu_wait_io_event_common(cpu
);
1139 /* process any pending work */
1140 atomic_mb_set(&exit_request
, 1);
1146 int64_t deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
1148 if (deadline
== 0) {
1149 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
1152 qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus
));
1158 static void qemu_cpu_kick_thread(CPUState
*cpu
)
1163 if (cpu
->thread_kicked
) {
1166 cpu
->thread_kicked
= true;
1167 err
= pthread_kill(cpu
->thread
->thread
, SIG_IPI
);
1169 fprintf(stderr
, "qemu:%s: %s", __func__
, strerror(err
));
1177 static void qemu_cpu_kick_no_halt(void)
1180 /* Ensure whatever caused the exit has reached the CPU threads before
1181 * writing exit_request.
1183 atomic_mb_set(&exit_request
, 1);
1184 cpu
= atomic_mb_read(&tcg_current_cpu
);
1190 void qemu_cpu_kick(CPUState
*cpu
)
1192 qemu_cond_broadcast(cpu
->halt_cond
);
1193 if (tcg_enabled()) {
1194 qemu_cpu_kick_no_halt();
1196 qemu_cpu_kick_thread(cpu
);
1200 void qemu_cpu_kick_self(void)
1202 assert(current_cpu
);
1203 qemu_cpu_kick_thread(current_cpu
);
1206 bool qemu_cpu_is_self(CPUState
*cpu
)
1208 return qemu_thread_is_self(cpu
->thread
);
1211 bool qemu_in_vcpu_thread(void)
1213 return current_cpu
&& qemu_cpu_is_self(current_cpu
);
/*
 * Per-thread flag: set by qemu_mutex_lock_iothread() and cleared by
 * qemu_mutex_unlock_iothread() in the calling thread.
 */
static __thread bool iothread_locked = false;

/*
 * Return the calling thread's iothread_locked flag.
 */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1223 void qemu_mutex_lock_iothread(void)
1225 atomic_inc(&iothread_requesting_mutex
);
1226 /* In the simple case there is no need to bump the VCPU thread out of
1227 * TCG code execution.
1229 if (!tcg_enabled() || qemu_in_vcpu_thread() ||
1230 !first_cpu
|| !first_cpu
->created
) {
1231 qemu_mutex_lock(&qemu_global_mutex
);
1232 atomic_dec(&iothread_requesting_mutex
);
1234 if (qemu_mutex_trylock(&qemu_global_mutex
)) {
1235 qemu_cpu_kick_no_halt();
1236 qemu_mutex_lock(&qemu_global_mutex
);
1238 atomic_dec(&iothread_requesting_mutex
);
1239 qemu_cond_broadcast(&qemu_io_proceeded_cond
);
1241 iothread_locked
= true;
1244 void qemu_mutex_unlock_iothread(void)
1246 iothread_locked
= false;
1247 qemu_mutex_unlock(&qemu_global_mutex
);
1250 static int all_vcpus_paused(void)
1255 if (!cpu
->stopped
) {
1263 void pause_all_vcpus(void)
1267 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, false);
1273 if (qemu_in_vcpu_thread()) {
1275 if (!kvm_enabled()) {
1278 cpu
->stopped
= true;
1284 while (!all_vcpus_paused()) {
1285 qemu_cond_wait(&qemu_pause_cond
, &qemu_global_mutex
);
1292 void cpu_resume(CPUState
*cpu
)
1295 cpu
->stopped
= false;
1299 void resume_all_vcpus(void)
1303 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, true);
/* Size of the temporary buffers used to build vCPU thread names */
1310 #define VCPU_THREAD_NAME_SIZE 16
1312 static void qemu_tcg_init_vcpu(CPUState
*cpu
)
1314 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1315 static QemuCond
*tcg_halt_cond
;
1316 static QemuThread
*tcg_cpu_thread
;
1318 /* share a single thread for all cpus with TCG */
1319 if (!tcg_cpu_thread
) {
1320 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1321 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1322 qemu_cond_init(cpu
->halt_cond
);
1323 tcg_halt_cond
= cpu
->halt_cond
;
1324 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/TCG",
1326 qemu_thread_create(cpu
->thread
, thread_name
, qemu_tcg_cpu_thread_fn
,
1327 cpu
, QEMU_THREAD_JOINABLE
);
1329 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
1331 while (!cpu
->created
) {
1332 qemu_cond_wait(&qemu_cpu_cond
, &qemu_global_mutex
);
1334 tcg_cpu_thread
= cpu
->thread
;
1336 cpu
->thread
= tcg_cpu_thread
;
1337 cpu
->halt_cond
= tcg_halt_cond
;
1341 static void qemu_kvm_start_vcpu(CPUState
*cpu
)
1343 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1345 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1346 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1347 qemu_cond_init(cpu
->halt_cond
);
1348 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/KVM",
1350 qemu_thread_create(cpu
->thread
, thread_name
, qemu_kvm_cpu_thread_fn
,
1351 cpu
, QEMU_THREAD_JOINABLE
);
1352 while (!cpu
->created
) {
1353 qemu_cond_wait(&qemu_cpu_cond
, &qemu_global_mutex
);
1357 static void qemu_dummy_start_vcpu(CPUState
*cpu
)
1359 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1361 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1362 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1363 qemu_cond_init(cpu
->halt_cond
);
1364 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/DUMMY",
1366 qemu_thread_create(cpu
->thread
, thread_name
, qemu_dummy_cpu_thread_fn
, cpu
,
1367 QEMU_THREAD_JOINABLE
);
1368 while (!cpu
->created
) {
1369 qemu_cond_wait(&qemu_cpu_cond
, &qemu_global_mutex
);
1373 void qemu_init_vcpu(CPUState
*cpu
)
1375 cpu
->nr_cores
= smp_cores
;
1376 cpu
->nr_threads
= smp_threads
;
1377 cpu
->stopped
= true;
1380 /* If the target cpu hasn't set up any address spaces itself,
1381 * give it the default one.
1383 AddressSpace
*as
= address_space_init_shareable(cpu
->memory
,
1386 cpu_address_space_init(cpu
, as
, 0);
1389 if (kvm_enabled()) {
1390 qemu_kvm_start_vcpu(cpu
);
1391 } else if (tcg_enabled()) {
1392 qemu_tcg_init_vcpu(cpu
);
1394 qemu_dummy_start_vcpu(cpu
);
1398 void cpu_stop_current(void)
1401 current_cpu
->stop
= false;
1402 current_cpu
->stopped
= true;
1403 cpu_exit(current_cpu
);
1404 qemu_cond_broadcast(&qemu_pause_cond
);
1408 int vm_stop(RunState state
)
1410 if (qemu_in_vcpu_thread()) {
1411 qemu_system_vmstop_request_prepare();
1412 qemu_system_vmstop_request(state
);
1414 * FIXME: should not return to device code in case
1415 * vm_stop() has been requested.
1421 return do_vm_stop(state
);
1424 /* does a state transition even if the VM is already stopped,
1425 current state is forgotten forever */
1426 int vm_stop_force_state(RunState state
)
1428 if (runstate_is_running()) {
1429 return vm_stop(state
);
1431 runstate_set(state
);
1434 /* Make sure to return an error if the flush in a previous vm_stop()
1436 return bdrv_flush_all();
1440 static int64_t tcg_get_icount_limit(void)
1444 if (replay_mode
!= REPLAY_MODE_PLAY
) {
1445 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
1447 /* Maintain prior (possibly buggy) behaviour where if no deadline
1448 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1449 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1452 if ((deadline
< 0) || (deadline
> INT32_MAX
)) {
1453 deadline
= INT32_MAX
;
1456 return qemu_icount_round(deadline
);
1458 return replay_get_instructions();
1462 static int tcg_cpu_exec(CPUState
*cpu
)
1465 #ifdef CONFIG_PROFILER
1469 #ifdef CONFIG_PROFILER
1470 ti
= profile_getclock();
1475 timers_state
.qemu_icount
-= (cpu
->icount_decr
.u16
.low
1476 + cpu
->icount_extra
);
1477 cpu
->icount_decr
.u16
.low
= 0;
1478 cpu
->icount_extra
= 0;
1479 count
= tcg_get_icount_limit();
1480 timers_state
.qemu_icount
+= count
;
1481 decr
= (count
> 0xffff) ? 0xffff : count
;
1483 cpu
->icount_decr
.u16
.low
= decr
;
1484 cpu
->icount_extra
= count
;
1486 ret
= cpu_exec(cpu
);
1487 #ifdef CONFIG_PROFILER
1488 tcg_time
+= profile_getclock() - ti
;
1491 /* Fold pending instructions back into the
1492 instruction counter, and clear the interrupt flag. */
1493 timers_state
.qemu_icount
-= (cpu
->icount_decr
.u16
.low
1494 + cpu
->icount_extra
);
1495 cpu
->icount_decr
.u32
= 0;
1496 cpu
->icount_extra
= 0;
1497 replay_account_executed_instructions();
1502 static void tcg_exec_all(void)
1506 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1507 qemu_account_warp_timer();
1509 if (next_cpu
== NULL
) {
1510 next_cpu
= first_cpu
;
1512 for (; next_cpu
!= NULL
&& !exit_request
; next_cpu
= CPU_NEXT(next_cpu
)) {
1513 CPUState
*cpu
= next_cpu
;
1515 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
,
1516 (cpu
->singlestep_enabled
& SSTEP_NOTIMER
) == 0);
1518 if (cpu_can_run(cpu
)) {
1519 r
= tcg_cpu_exec(cpu
);
1520 if (r
== EXCP_DEBUG
) {
1521 cpu_handle_guest_debug(cpu
);
1524 } else if (cpu
->stop
|| cpu
->stopped
) {
1529 /* Pairs with smp_wmb in qemu_cpu_kick. */
1530 atomic_mb_set(&exit_request
, 0);
1533 void list_cpus(FILE *f
, fprintf_function cpu_fprintf
, const char *optarg
)
1535 /* XXX: implement xxx_cpu_list for targets that still miss it */
1536 #if defined(cpu_list)
1537 cpu_list(f
, cpu_fprintf
);
1541 CpuInfoList
*qmp_query_cpus(Error
**errp
)
1543 CpuInfoList
*head
= NULL
, *cur_item
= NULL
;
1548 #if defined(TARGET_I386)
1549 X86CPU
*x86_cpu
= X86_CPU(cpu
);
1550 CPUX86State
*env
= &x86_cpu
->env
;
1551 #elif defined(TARGET_PPC)
1552 PowerPCCPU
*ppc_cpu
= POWERPC_CPU(cpu
);
1553 CPUPPCState
*env
= &ppc_cpu
->env
;
1554 #elif defined(TARGET_SPARC)
1555 SPARCCPU
*sparc_cpu
= SPARC_CPU(cpu
);
1556 CPUSPARCState
*env
= &sparc_cpu
->env
;
1557 #elif defined(TARGET_MIPS)
1558 MIPSCPU
*mips_cpu
= MIPS_CPU(cpu
);
1559 CPUMIPSState
*env
= &mips_cpu
->env
;
1560 #elif defined(TARGET_TRICORE)
1561 TriCoreCPU
*tricore_cpu
= TRICORE_CPU(cpu
);
1562 CPUTriCoreState
*env
= &tricore_cpu
->env
;
1565 cpu_synchronize_state(cpu
);
1567 info
= g_malloc0(sizeof(*info
));
1568 info
->value
= g_malloc0(sizeof(*info
->value
));
1569 info
->value
->CPU
= cpu
->cpu_index
;
1570 info
->value
->current
= (cpu
== first_cpu
);
1571 info
->value
->halted
= cpu
->halted
;
1572 info
->value
->qom_path
= object_get_canonical_path(OBJECT(cpu
));
1573 info
->value
->thread_id
= cpu
->thread_id
;
1574 #if defined(TARGET_I386)
1575 info
->value
->arch
= CPU_INFO_ARCH_X86
;
1576 info
->value
->u
.x86
.pc
= env
->eip
+ env
->segs
[R_CS
].base
;
1577 #elif defined(TARGET_PPC)
1578 info
->value
->arch
= CPU_INFO_ARCH_PPC
;
1579 info
->value
->u
.ppc
.nip
= env
->nip
;
1580 #elif defined(TARGET_SPARC)
1581 info
->value
->arch
= CPU_INFO_ARCH_SPARC
;
1582 info
->value
->u
.q_sparc
.pc
= env
->pc
;
1583 info
->value
->u
.q_sparc
.npc
= env
->npc
;
1584 #elif defined(TARGET_MIPS)
1585 info
->value
->arch
= CPU_INFO_ARCH_MIPS
;
1586 info
->value
->u
.q_mips
.PC
= env
->active_tc
.PC
;
1587 #elif defined(TARGET_TRICORE)
1588 info
->value
->arch
= CPU_INFO_ARCH_TRICORE
;
1589 info
->value
->u
.tricore
.PC
= env
->PC
;
1591 info
->value
->arch
= CPU_INFO_ARCH_OTHER
;
1594 /* XXX: waiting for the qapi to support GSList */
1596 head
= cur_item
= info
;
1598 cur_item
->next
= info
;
1606 void qmp_memsave(int64_t addr
, int64_t size
, const char *filename
,
1607 bool has_cpu
, int64_t cpu_index
, Error
**errp
)
1613 int64_t orig_addr
= addr
, orig_size
= size
;
1619 cpu
= qemu_get_cpu(cpu_index
);
1621 error_setg(errp
, QERR_INVALID_PARAMETER_VALUE
, "cpu-index",
1626 f
= fopen(filename
, "wb");
1628 error_setg_file_open(errp
, errno
, filename
);
1636 if (cpu_memory_rw_debug(cpu
, addr
, buf
, l
, 0) != 0) {
1637 error_setg(errp
, "Invalid addr 0x%016" PRIx64
"/size %" PRId64
1638 " specified", orig_addr
, orig_size
);
1641 if (fwrite(buf
, 1, l
, f
) != l
) {
1642 error_setg(errp
, QERR_IO_ERROR
);
1653 void qmp_pmemsave(int64_t addr
, int64_t size
, const char *filename
,
1660 f
= fopen(filename
, "wb");
1662 error_setg_file_open(errp
, errno
, filename
);
1670 cpu_physical_memory_read(addr
, buf
, l
);
1671 if (fwrite(buf
, 1, l
, f
) != l
) {
1672 error_setg(errp
, QERR_IO_ERROR
);
1683 void qmp_inject_nmi(Error
**errp
)
1685 #if defined(TARGET_I386)
1689 X86CPU
*cpu
= X86_CPU(cs
);
1691 if (!cpu
->apic_state
) {
1692 cpu_interrupt(cs
, CPU_INTERRUPT_NMI
);
1694 apic_deliver_nmi(cpu
->apic_state
);
1698 nmi_monitor_handle(monitor_get_cpu_index(), errp
);
1702 void dump_drift_info(FILE *f
, fprintf_function cpu_fprintf
)
1708 cpu_fprintf(f
, "Host - Guest clock %"PRIi64
" ms\n",
1709 (cpu_get_clock() - cpu_get_icount())/SCALE_MS
);
1710 if (icount_align_option
) {
1711 cpu_fprintf(f
, "Max guest delay %"PRIi64
" ms\n", -max_delay
/SCALE_MS
);
1712 cpu_fprintf(f
, "Max guest advance %"PRIi64
" ms\n", max_advance
/SCALE_MS
);
1714 cpu_fprintf(f
, "Max guest delay NA\n");
1715 cpu_fprintf(f
, "Max guest advance NA\n");