4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
28 #include "monitor/monitor.h"
29 #include "qapi/qmp/qerror.h"
30 #include "qemu/error-report.h"
31 #include "sysemu/sysemu.h"
32 #include "sysemu/block-backend.h"
33 #include "exec/gdbstub.h"
34 #include "sysemu/dma.h"
35 #include "sysemu/kvm.h"
36 #include "qmp-commands.h"
38 #include "qemu/thread.h"
39 #include "sysemu/cpus.h"
40 #include "sysemu/qtest.h"
41 #include "qemu/main-loop.h"
42 #include "qemu/bitmap.h"
43 #include "qemu/seqlock.h"
44 #include "qapi-event.h"
46 #include "sysemu/replay.h"
49 #include "qemu/compatfd.h"
54 #include <sys/prctl.h>
57 #define PR_MCE_KILL 33
60 #ifndef PR_MCE_KILL_SET
61 #define PR_MCE_KILL_SET 1
64 #ifndef PR_MCE_KILL_EARLY
65 #define PR_MCE_KILL_EARLY 1
68 #endif /* CONFIG_LINUX */
70 static CPUState
*next_cpu
;
74 /* vcpu throttling controls */
75 static QEMUTimer
*throttle_timer
;
76 static unsigned int throttle_percentage
;
78 #define CPU_THROTTLE_PCT_MIN 1
79 #define CPU_THROTTLE_PCT_MAX 99
80 #define CPU_THROTTLE_TIMESLICE_NS 10000000
82 bool cpu_is_stopped(CPUState
*cpu
)
84 return cpu
->stopped
|| !runstate_is_running();
87 static bool cpu_thread_is_idle(CPUState
*cpu
)
89 if (cpu
->stop
|| cpu
->queued_work_first
) {
92 if (cpu_is_stopped(cpu
)) {
95 if (!cpu
->halted
|| cpu_has_work(cpu
) ||
96 kvm_halt_in_kernel()) {
102 static bool all_cpu_threads_idle(void)
107 if (!cpu_thread_is_idle(cpu
)) {
114 /***********************************************************/
115 /* guest cycle counter */
117 /* Protected by TimersState seqlock */
119 static bool icount_sleep
= true;
120 static int64_t vm_clock_warp_start
= -1;
121 /* Conversion factor from emulated instructions to virtual clock ticks. */
122 static int icount_time_shift
;
123 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
124 #define MAX_ICOUNT_SHIFT 10
126 static QEMUTimer
*icount_rt_timer
;
127 static QEMUTimer
*icount_vm_timer
;
128 static QEMUTimer
*icount_warp_timer
;
130 typedef struct TimersState
{
131 /* Protected by BQL. */
132 int64_t cpu_ticks_prev
;
133 int64_t cpu_ticks_offset
;
135 /* cpu_clock_offset can be read out of BQL, so protect it with
138 QemuSeqLock vm_clock_seqlock
;
139 int64_t cpu_clock_offset
;
140 int32_t cpu_ticks_enabled
;
143 /* Compensate for varying guest execution speed. */
144 int64_t qemu_icount_bias
;
145 /* Only written by TCG thread */
149 static TimersState timers_state
;
151 int64_t cpu_get_icount_raw(void)
154 CPUState
*cpu
= current_cpu
;
156 icount
= timers_state
.qemu_icount
;
158 if (!cpu
->can_do_io
) {
159 fprintf(stderr
, "Bad icount read\n");
162 icount
-= (cpu
->icount_decr
.u16
.low
+ cpu
->icount_extra
);
167 /* Return the virtual CPU time, based on the instruction counter. */
168 static int64_t cpu_get_icount_locked(void)
170 int64_t icount
= cpu_get_icount_raw();
171 return timers_state
.qemu_icount_bias
+ cpu_icount_to_ns(icount
);
174 int64_t cpu_get_icount(void)
180 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
181 icount
= cpu_get_icount_locked();
182 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
187 int64_t cpu_icount_to_ns(int64_t icount
)
189 return icount
<< icount_time_shift
;
192 /* return the host CPU cycle counter and handle stop/restart */
193 /* Caller must hold the BQL */
194 int64_t cpu_get_ticks(void)
199 return cpu_get_icount();
202 ticks
= timers_state
.cpu_ticks_offset
;
203 if (timers_state
.cpu_ticks_enabled
) {
204 ticks
+= cpu_get_host_ticks();
207 if (timers_state
.cpu_ticks_prev
> ticks
) {
208 /* Note: non increasing ticks may happen if the host uses
210 timers_state
.cpu_ticks_offset
+= timers_state
.cpu_ticks_prev
- ticks
;
211 ticks
= timers_state
.cpu_ticks_prev
;
214 timers_state
.cpu_ticks_prev
= ticks
;
218 static int64_t cpu_get_clock_locked(void)
222 ticks
= timers_state
.cpu_clock_offset
;
223 if (timers_state
.cpu_ticks_enabled
) {
224 ticks
+= get_clock();
230 /* return the host CPU monotonic timer and handle stop/restart */
231 int64_t cpu_get_clock(void)
237 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
238 ti
= cpu_get_clock_locked();
239 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
244 /* enable cpu_get_ticks()
245 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
247 void cpu_enable_ticks(void)
249 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
250 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
251 if (!timers_state
.cpu_ticks_enabled
) {
252 timers_state
.cpu_ticks_offset
-= cpu_get_host_ticks();
253 timers_state
.cpu_clock_offset
-= get_clock();
254 timers_state
.cpu_ticks_enabled
= 1;
256 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
259 /* disable cpu_get_ticks() : the clock is stopped. You must not call
260 * cpu_get_ticks() after that.
261 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
263 void cpu_disable_ticks(void)
265 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
266 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
267 if (timers_state
.cpu_ticks_enabled
) {
268 timers_state
.cpu_ticks_offset
+= cpu_get_host_ticks();
269 timers_state
.cpu_clock_offset
= cpu_get_clock_locked();
270 timers_state
.cpu_ticks_enabled
= 0;
272 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
275 /* Correlation between real and virtual time is always going to be
276 fairly approximate, so ignore small variation.
277 When the guest is idle real and virtual time will be aligned in
279 #define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)
281 static void icount_adjust(void)
287 /* Protected by TimersState mutex. */
288 static int64_t last_delta
;
290 /* If the VM is not running, then do nothing. */
291 if (!runstate_is_running()) {
295 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
296 cur_time
= cpu_get_clock_locked();
297 cur_icount
= cpu_get_icount_locked();
299 delta
= cur_icount
- cur_time
;
300 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
302 && last_delta
+ ICOUNT_WOBBLE
< delta
* 2
303 && icount_time_shift
> 0) {
304 /* The guest is getting too far ahead. Slow time down. */
308 && last_delta
- ICOUNT_WOBBLE
> delta
* 2
309 && icount_time_shift
< MAX_ICOUNT_SHIFT
) {
310 /* The guest is getting too far behind. Speed time up. */
314 timers_state
.qemu_icount_bias
= cur_icount
315 - (timers_state
.qemu_icount
<< icount_time_shift
);
316 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
319 static void icount_adjust_rt(void *opaque
)
321 timer_mod(icount_rt_timer
,
322 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
326 static void icount_adjust_vm(void *opaque
)
328 timer_mod(icount_vm_timer
,
329 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
330 get_ticks_per_sec() / 10);
334 static int64_t qemu_icount_round(int64_t count
)
336 return (count
+ (1 << icount_time_shift
) - 1) >> icount_time_shift
;
339 static void icount_warp_rt(void)
341 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
342 * changes from -1 to another value, so the race here is okay.
344 if (atomic_read(&vm_clock_warp_start
) == -1) {
348 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
349 if (runstate_is_running()) {
350 int64_t clock
= REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT
,
351 cpu_get_clock_locked());
354 warp_delta
= clock
- vm_clock_warp_start
;
355 if (use_icount
== 2) {
357 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
358 * far ahead of real time.
360 int64_t cur_icount
= cpu_get_icount_locked();
361 int64_t delta
= clock
- cur_icount
;
362 warp_delta
= MIN(warp_delta
, delta
);
364 timers_state
.qemu_icount_bias
+= warp_delta
;
366 vm_clock_warp_start
= -1;
367 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
369 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL
)) {
370 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
374 static void icount_timer_cb(void *opaque
)
376 /* No need for a checkpoint because the timer already synchronizes
377 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
382 void qtest_clock_warp(int64_t dest
)
384 int64_t clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
385 AioContext
*aio_context
;
386 assert(qtest_enabled());
387 aio_context
= qemu_get_aio_context();
388 while (clock
< dest
) {
389 int64_t deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
390 int64_t warp
= qemu_soonest_timeout(dest
- clock
, deadline
);
392 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
393 timers_state
.qemu_icount_bias
+= warp
;
394 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
396 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL
);
397 timerlist_run_timers(aio_context
->tlg
.tl
[QEMU_CLOCK_VIRTUAL
]);
398 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
400 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
403 void qemu_start_warp_timer(void)
412 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
413 * do not fire, so computing the deadline does not make sense.
415 if (!runstate_is_running()) {
419 /* warp clock deterministically in record/replay mode */
420 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START
)) {
424 if (!all_cpu_threads_idle()) {
428 if (qtest_enabled()) {
429 /* When testing, qtest commands advance icount. */
433 /* We want to use the earliest deadline from ALL vm_clocks */
434 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
);
435 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
437 static bool notified
;
438 if (!icount_sleep
&& !notified
) {
439 error_report("WARNING: icount sleep disabled and no active timers");
447 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
448 * sleep. Otherwise, the CPU might be waiting for a future timer
449 * interrupt to wake it up, but the interrupt never comes because
450 * the vCPU isn't running any insns and thus doesn't advance the
451 * QEMU_CLOCK_VIRTUAL.
455 * We never let VCPUs sleep in no sleep icount mode.
456 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
457 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
458 * It is useful when we want a deterministic execution time,
459 * isolated from host latencies.
461 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
462 timers_state
.qemu_icount_bias
+= deadline
;
463 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
464 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
467 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
468 * "real" time, (related to the time left until the next event) has
469 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
470 * This avoids that the warps are visible externally; for example,
471 * you will not be sending network packets continuously instead of
474 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
475 if (vm_clock_warp_start
== -1 || vm_clock_warp_start
> clock
) {
476 vm_clock_warp_start
= clock
;
478 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
479 timer_mod_anticipate(icount_warp_timer
, clock
+ deadline
);
481 } else if (deadline
== 0) {
482 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
486 static void qemu_account_warp_timer(void)
488 if (!use_icount
|| !icount_sleep
) {
492 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
493 * do not fire, so computing the deadline does not make sense.
495 if (!runstate_is_running()) {
499 /* warp clock deterministically in record/replay mode */
500 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT
)) {
504 timer_del(icount_warp_timer
);
508 static bool icount_state_needed(void *opaque
)
514 * This is a subsection for icount migration.
516 static const VMStateDescription icount_vmstate_timers
= {
517 .name
= "timer/icount",
519 .minimum_version_id
= 1,
520 .needed
= icount_state_needed
,
521 .fields
= (VMStateField
[]) {
522 VMSTATE_INT64(qemu_icount_bias
, TimersState
),
523 VMSTATE_INT64(qemu_icount
, TimersState
),
524 VMSTATE_END_OF_LIST()
528 static const VMStateDescription vmstate_timers
= {
531 .minimum_version_id
= 1,
532 .fields
= (VMStateField
[]) {
533 VMSTATE_INT64(cpu_ticks_offset
, TimersState
),
534 VMSTATE_INT64(dummy
, TimersState
),
535 VMSTATE_INT64_V(cpu_clock_offset
, TimersState
, 2),
536 VMSTATE_END_OF_LIST()
538 .subsections
= (const VMStateDescription
*[]) {
539 &icount_vmstate_timers
,
544 static void cpu_throttle_thread(void *opaque
)
546 CPUState
*cpu
= opaque
;
548 double throttle_ratio
;
551 if (!cpu_throttle_get_percentage()) {
555 pct
= (double)cpu_throttle_get_percentage()/100;
556 throttle_ratio
= pct
/ (1 - pct
);
557 sleeptime_ns
= (long)(throttle_ratio
* CPU_THROTTLE_TIMESLICE_NS
);
559 qemu_mutex_unlock_iothread();
560 atomic_set(&cpu
->throttle_thread_scheduled
, 0);
561 g_usleep(sleeptime_ns
/ 1000); /* Convert ns to us for usleep call */
562 qemu_mutex_lock_iothread();
565 static void cpu_throttle_timer_tick(void *opaque
)
570 /* Stop the timer if needed */
571 if (!cpu_throttle_get_percentage()) {
575 if (!atomic_xchg(&cpu
->throttle_thread_scheduled
, 1)) {
576 async_run_on_cpu(cpu
, cpu_throttle_thread
, cpu
);
580 pct
= (double)cpu_throttle_get_percentage()/100;
581 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
582 CPU_THROTTLE_TIMESLICE_NS
/ (1-pct
));
585 void cpu_throttle_set(int new_throttle_pct
)
587 /* Ensure throttle percentage is within valid range */
588 new_throttle_pct
= MIN(new_throttle_pct
, CPU_THROTTLE_PCT_MAX
);
589 new_throttle_pct
= MAX(new_throttle_pct
, CPU_THROTTLE_PCT_MIN
);
591 atomic_set(&throttle_percentage
, new_throttle_pct
);
593 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
594 CPU_THROTTLE_TIMESLICE_NS
);
597 void cpu_throttle_stop(void)
599 atomic_set(&throttle_percentage
, 0);
/* Return true while a non-zero throttle percentage is in effect. */
bool cpu_throttle_active(void)
{
    const int pct = cpu_throttle_get_percentage();

    return pct != 0;
}
607 int cpu_throttle_get_percentage(void)
609 return atomic_read(&throttle_percentage
);
612 void cpu_ticks_init(void)
614 seqlock_init(&timers_state
.vm_clock_seqlock
, NULL
);
615 vmstate_register(NULL
, 0, &vmstate_timers
, &timers_state
);
616 throttle_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
617 cpu_throttle_timer_tick
, NULL
);
620 void configure_icount(QemuOpts
*opts
, Error
**errp
)
623 char *rem_str
= NULL
;
625 option
= qemu_opt_get(opts
, "shift");
627 if (qemu_opt_get(opts
, "align") != NULL
) {
628 error_setg(errp
, "Please specify shift option when using align");
633 icount_sleep
= qemu_opt_get_bool(opts
, "sleep", true);
635 icount_warp_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
636 icount_timer_cb
, NULL
);
639 icount_align_option
= qemu_opt_get_bool(opts
, "align", false);
641 if (icount_align_option
&& !icount_sleep
) {
642 error_setg(errp
, "align=on and sleep=off are incompatible");
644 if (strcmp(option
, "auto") != 0) {
646 icount_time_shift
= strtol(option
, &rem_str
, 0);
647 if (errno
!= 0 || *rem_str
!= '\0' || !strlen(option
)) {
648 error_setg(errp
, "icount: Invalid shift value");
652 } else if (icount_align_option
) {
653 error_setg(errp
, "shift=auto and align=on are incompatible");
654 } else if (!icount_sleep
) {
655 error_setg(errp
, "shift=auto and sleep=off are incompatible");
660 /* 125MIPS seems a reasonable initial guess at the guest speed.
661 It will be corrected fairly quickly anyway. */
662 icount_time_shift
= 3;
664 /* Have both realtime and virtual time triggers for speed adjustment.
665 The realtime trigger catches emulated time passing too slowly,
666 the virtual time trigger catches emulated time passing too fast.
667 Realtime triggers occur even when idle, so use them less frequently
669 icount_rt_timer
= timer_new_ms(QEMU_CLOCK_VIRTUAL_RT
,
670 icount_adjust_rt
, NULL
);
671 timer_mod(icount_rt_timer
,
672 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
673 icount_vm_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL
,
674 icount_adjust_vm
, NULL
);
675 timer_mod(icount_vm_timer
,
676 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
677 get_ticks_per_sec() / 10);
680 /***********************************************************/
681 void hw_error(const char *fmt
, ...)
687 fprintf(stderr
, "qemu: hardware error: ");
688 vfprintf(stderr
, fmt
, ap
);
689 fprintf(stderr
, "\n");
691 fprintf(stderr
, "CPU #%d:\n", cpu
->cpu_index
);
692 cpu_dump_state(cpu
, stderr
, fprintf
, CPU_DUMP_FPU
);
698 void cpu_synchronize_all_states(void)
703 cpu_synchronize_state(cpu
);
707 void cpu_synchronize_all_post_reset(void)
712 cpu_synchronize_post_reset(cpu
);
716 void cpu_synchronize_all_post_init(void)
721 cpu_synchronize_post_init(cpu
);
725 static int do_vm_stop(RunState state
)
729 if (runstate_is_running()) {
733 vm_state_notify(0, state
);
734 qapi_event_send_stop(&error_abort
);
738 ret
= blk_flush_all();
743 static bool cpu_can_run(CPUState
*cpu
)
748 if (cpu_is_stopped(cpu
)) {
754 static void cpu_handle_guest_debug(CPUState
*cpu
)
756 gdb_set_stop_cpu(cpu
);
757 qemu_system_debug_request();
762 static void sigbus_reraise(void)
765 struct sigaction action
;
767 memset(&action
, 0, sizeof(action
));
768 action
.sa_handler
= SIG_DFL
;
769 if (!sigaction(SIGBUS
, &action
, NULL
)) {
772 sigaddset(&set
, SIGBUS
);
773 sigprocmask(SIG_UNBLOCK
, &set
, NULL
);
775 perror("Failed to re-raise SIGBUS!\n");
779 static void sigbus_handler(int n
, struct qemu_signalfd_siginfo
*siginfo
,
782 if (kvm_on_sigbus(siginfo
->ssi_code
,
783 (void *)(intptr_t)siginfo
->ssi_addr
)) {
788 static void qemu_init_sigbus(void)
790 struct sigaction action
;
792 memset(&action
, 0, sizeof(action
));
793 action
.sa_flags
= SA_SIGINFO
;
794 action
.sa_sigaction
= (void (*)(int, siginfo_t
*, void*))sigbus_handler
;
795 sigaction(SIGBUS
, &action
, NULL
);
797 prctl(PR_MCE_KILL
, PR_MCE_KILL_SET
, PR_MCE_KILL_EARLY
, 0, 0);
800 static void qemu_kvm_eat_signals(CPUState
*cpu
)
802 struct timespec ts
= { 0, 0 };
808 sigemptyset(&waitset
);
809 sigaddset(&waitset
, SIG_IPI
);
810 sigaddset(&waitset
, SIGBUS
);
813 r
= sigtimedwait(&waitset
, &siginfo
, &ts
);
814 if (r
== -1 && !(errno
== EAGAIN
|| errno
== EINTR
)) {
815 perror("sigtimedwait");
821 if (kvm_on_sigbus_vcpu(cpu
, siginfo
.si_code
, siginfo
.si_addr
)) {
829 r
= sigpending(&chkset
);
831 perror("sigpending");
834 } while (sigismember(&chkset
, SIG_IPI
) || sigismember(&chkset
, SIGBUS
));
837 #else /* !CONFIG_LINUX */
/* !CONFIG_LINUX variant: there is no SIGBUS setup to perform. */
static void qemu_init_sigbus(void)
{
    /* intentionally empty */
}
843 static void qemu_kvm_eat_signals(CPUState
*cpu
)
846 #endif /* !CONFIG_LINUX */
/*
 * Do-nothing signal handler.  It is installed for SIG_IPI by
 * qemu_kvm_init_cpu_signals(); @sig is ignored.
 */
static void dummy_signal(int sig)
{
    /* intentionally empty */
}
853 static void qemu_kvm_init_cpu_signals(CPUState
*cpu
)
857 struct sigaction sigact
;
859 memset(&sigact
, 0, sizeof(sigact
));
860 sigact
.sa_handler
= dummy_signal
;
861 sigaction(SIG_IPI
, &sigact
, NULL
);
863 pthread_sigmask(SIG_BLOCK
, NULL
, &set
);
864 sigdelset(&set
, SIG_IPI
);
865 sigdelset(&set
, SIGBUS
);
866 r
= kvm_set_signal_mask(cpu
, &set
);
868 fprintf(stderr
, "kvm_set_signal_mask: %s\n", strerror(-r
));
874 static void qemu_kvm_init_cpu_signals(CPUState
*cpu
)
880 static QemuMutex qemu_global_mutex
;
881 static QemuCond qemu_io_proceeded_cond
;
882 static unsigned iothread_requesting_mutex
;
884 static QemuThread io_thread
;
887 static QemuCond qemu_cpu_cond
;
889 static QemuCond qemu_pause_cond
;
890 static QemuCond qemu_work_cond
;
892 void qemu_init_cpu_loop(void)
895 qemu_cond_init(&qemu_cpu_cond
);
896 qemu_cond_init(&qemu_pause_cond
);
897 qemu_cond_init(&qemu_work_cond
);
898 qemu_cond_init(&qemu_io_proceeded_cond
);
899 qemu_mutex_init(&qemu_global_mutex
);
901 qemu_thread_get_self(&io_thread
);
904 void run_on_cpu(CPUState
*cpu
, void (*func
)(void *data
), void *data
)
906 struct qemu_work_item wi
;
908 if (qemu_cpu_is_self(cpu
)) {
917 qemu_mutex_lock(&cpu
->work_mutex
);
918 if (cpu
->queued_work_first
== NULL
) {
919 cpu
->queued_work_first
= &wi
;
921 cpu
->queued_work_last
->next
= &wi
;
923 cpu
->queued_work_last
= &wi
;
926 qemu_mutex_unlock(&cpu
->work_mutex
);
929 while (!atomic_mb_read(&wi
.done
)) {
930 CPUState
*self_cpu
= current_cpu
;
932 qemu_cond_wait(&qemu_work_cond
, &qemu_global_mutex
);
933 current_cpu
= self_cpu
;
937 void async_run_on_cpu(CPUState
*cpu
, void (*func
)(void *data
), void *data
)
939 struct qemu_work_item
*wi
;
941 if (qemu_cpu_is_self(cpu
)) {
946 wi
= g_malloc0(sizeof(struct qemu_work_item
));
951 qemu_mutex_lock(&cpu
->work_mutex
);
952 if (cpu
->queued_work_first
== NULL
) {
953 cpu
->queued_work_first
= wi
;
955 cpu
->queued_work_last
->next
= wi
;
957 cpu
->queued_work_last
= wi
;
960 qemu_mutex_unlock(&cpu
->work_mutex
);
965 static void flush_queued_work(CPUState
*cpu
)
967 struct qemu_work_item
*wi
;
969 if (cpu
->queued_work_first
== NULL
) {
973 qemu_mutex_lock(&cpu
->work_mutex
);
974 while (cpu
->queued_work_first
!= NULL
) {
975 wi
= cpu
->queued_work_first
;
976 cpu
->queued_work_first
= wi
->next
;
977 if (!cpu
->queued_work_first
) {
978 cpu
->queued_work_last
= NULL
;
980 qemu_mutex_unlock(&cpu
->work_mutex
);
982 qemu_mutex_lock(&cpu
->work_mutex
);
986 atomic_mb_set(&wi
->done
, true);
989 qemu_mutex_unlock(&cpu
->work_mutex
);
990 qemu_cond_broadcast(&qemu_work_cond
);
993 static void qemu_wait_io_event_common(CPUState
*cpu
)
998 qemu_cond_broadcast(&qemu_pause_cond
);
1000 flush_queued_work(cpu
);
1001 cpu
->thread_kicked
= false;
1004 static void qemu_tcg_wait_io_event(CPUState
*cpu
)
1006 while (all_cpu_threads_idle()) {
1007 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1010 while (iothread_requesting_mutex
) {
1011 qemu_cond_wait(&qemu_io_proceeded_cond
, &qemu_global_mutex
);
1015 qemu_wait_io_event_common(cpu
);
1019 static void qemu_kvm_wait_io_event(CPUState
*cpu
)
1021 while (cpu_thread_is_idle(cpu
)) {
1022 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1025 qemu_kvm_eat_signals(cpu
);
1026 qemu_wait_io_event_common(cpu
);
1029 static void *qemu_kvm_cpu_thread_fn(void *arg
)
1031 CPUState
*cpu
= arg
;
1034 rcu_register_thread();
1036 qemu_mutex_lock_iothread();
1037 qemu_thread_get_self(cpu
->thread
);
1038 cpu
->thread_id
= qemu_get_thread_id();
1042 r
= kvm_init_vcpu(cpu
);
1044 fprintf(stderr
, "kvm_init_vcpu failed: %s\n", strerror(-r
));
1048 qemu_kvm_init_cpu_signals(cpu
);
1050 /* signal CPU creation */
1051 cpu
->created
= true;
1052 qemu_cond_signal(&qemu_cpu_cond
);
1055 if (cpu_can_run(cpu
)) {
1056 r
= kvm_cpu_exec(cpu
);
1057 if (r
== EXCP_DEBUG
) {
1058 cpu_handle_guest_debug(cpu
);
1061 qemu_kvm_wait_io_event(cpu
);
1067 static void *qemu_dummy_cpu_thread_fn(void *arg
)
1070 fprintf(stderr
, "qtest is not supported under Windows\n");
1073 CPUState
*cpu
= arg
;
1077 rcu_register_thread();
1079 qemu_mutex_lock_iothread();
1080 qemu_thread_get_self(cpu
->thread
);
1081 cpu
->thread_id
= qemu_get_thread_id();
1084 sigemptyset(&waitset
);
1085 sigaddset(&waitset
, SIG_IPI
);
1087 /* signal CPU creation */
1088 cpu
->created
= true;
1089 qemu_cond_signal(&qemu_cpu_cond
);
1094 qemu_mutex_unlock_iothread();
1097 r
= sigwait(&waitset
, &sig
);
1098 } while (r
== -1 && (errno
== EAGAIN
|| errno
== EINTR
));
1103 qemu_mutex_lock_iothread();
1105 qemu_wait_io_event_common(cpu
);
1112 static void tcg_exec_all(void);
1114 static void *qemu_tcg_cpu_thread_fn(void *arg
)
1116 CPUState
*cpu
= arg
;
1118 rcu_register_thread();
1120 qemu_mutex_lock_iothread();
1121 qemu_thread_get_self(cpu
->thread
);
1124 cpu
->thread_id
= qemu_get_thread_id();
1125 cpu
->created
= true;
1128 qemu_cond_signal(&qemu_cpu_cond
);
1130 /* wait for initial kick-off after machine start */
1131 while (first_cpu
->stopped
) {
1132 qemu_cond_wait(first_cpu
->halt_cond
, &qemu_global_mutex
);
1134 /* process any pending work */
1136 qemu_wait_io_event_common(cpu
);
1140 /* process any pending work */
1141 atomic_mb_set(&exit_request
, 1);
1147 int64_t deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
1149 if (deadline
== 0) {
1150 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
1153 qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus
));
1159 static void qemu_cpu_kick_thread(CPUState
*cpu
)
1164 if (cpu
->thread_kicked
) {
1167 cpu
->thread_kicked
= true;
1168 err
= pthread_kill(cpu
->thread
->thread
, SIG_IPI
);
1170 fprintf(stderr
, "qemu:%s: %s", __func__
, strerror(err
));
1178 static void qemu_cpu_kick_no_halt(void)
1181 /* Ensure whatever caused the exit has reached the CPU threads before
1182 * writing exit_request.
1184 atomic_mb_set(&exit_request
, 1);
1185 cpu
= atomic_mb_read(&tcg_current_cpu
);
1191 void qemu_cpu_kick(CPUState
*cpu
)
1193 qemu_cond_broadcast(cpu
->halt_cond
);
1194 if (tcg_enabled()) {
1195 qemu_cpu_kick_no_halt();
1197 qemu_cpu_kick_thread(cpu
);
1201 void qemu_cpu_kick_self(void)
1203 assert(current_cpu
);
1204 qemu_cpu_kick_thread(current_cpu
);
1207 bool qemu_cpu_is_self(CPUState
*cpu
)
1209 return qemu_thread_is_self(cpu
->thread
);
1212 bool qemu_in_vcpu_thread(void)
1214 return current_cpu
&& qemu_cpu_is_self(current_cpu
);
/*
 * Per-thread flag: set to true at the end of qemu_mutex_lock_iothread() and
 * cleared in qemu_mutex_unlock_iothread().  Static storage is
 * zero-initialized, so it starts out false.
 */
static __thread bool iothread_locked;

/* Return the calling thread's iothread_locked flag. */
bool qemu_mutex_iothread_locked(void)
{
    const bool held = iothread_locked;

    return held;
}
/*
 * Acquire qemu_global_mutex for the calling thread and record that in the
 * thread-local iothread_locked flag.
 *
 * iothread_requesting_mutex is incremented on entry and decremented once the
 * mutex is held.  Two acquisition paths are visible:
 *  - TCG not enabled, caller is a vCPU thread, or first_cpu is NULL / not
 *    yet created: plain qemu_mutex_lock().
 *  - otherwise: qemu_mutex_trylock() is attempted; if it returns non-zero,
 *    qemu_cpu_kick_no_halt() is called before blocking in qemu_mutex_lock().
 *    qemu_io_proceeded_cond is broadcast after the counter is decremented.
 *
 * NOTE(review): the line separating the two paths is elided in this listing;
 * an else is presumed because each path decrements the counter -- confirm.
 */
1224 void qemu_mutex_lock_iothread(void)
1226 atomic_inc(&iothread_requesting_mutex
);
1227 /* In the simple case there is no need to bump the VCPU thread out of
1228 * TCG code execution.
1230 if (!tcg_enabled() || qemu_in_vcpu_thread() ||
1231 !first_cpu
|| !first_cpu
->created
) {
1232 qemu_mutex_lock(&qemu_global_mutex
);
1233 atomic_dec(&iothread_requesting_mutex
);
1235 if (qemu_mutex_trylock(&qemu_global_mutex
)) {
1236 qemu_cpu_kick_no_halt();
1237 qemu_mutex_lock(&qemu_global_mutex
);
1239 atomic_dec(&iothread_requesting_mutex
);
1240 qemu_cond_broadcast(&qemu_io_proceeded_cond
);
1242 iothread_locked
= true;
/*
 * Release qemu_global_mutex.  The thread-local iothread_locked flag is
 * cleared first, while the mutex is still held, and the mutex is unlocked
 * afterwards.
 */
1245 void qemu_mutex_unlock_iothread(void)
1247 iothread_locked
= false;
1248 qemu_mutex_unlock(&qemu_global_mutex
);
/*
 * Helper used by pause_all_vcpus() as its wait condition.  Only the per-CPU
 * test of cpu->stopped is visible in this listing.
 *
 * NOTE(review): the loop over CPUs and the return statements are elided;
 * presumably returns 0 as soon as a CPU is found that is not stopped and
 * non-zero otherwise -- confirm against the full file.
 */
1251 static int all_vcpus_paused(void)
1256 if (!cpu
->stopped
) {
/*
 * Pause all vCPUs.  Visible steps: QEMU_CLOCK_VIRTUAL is disabled; when the
 * caller is itself a vCPU thread there is a special case that, if KVM is not
 * enabled, marks a CPU as stopped directly; finally the caller waits on
 * qemu_pause_cond (with qemu_global_mutex) until all_vcpus_paused() is true.
 *
 * NOTE(review): the loops that request the stop on each CPU and the rest of
 * the vCPU-thread special case are elided in this listing -- confirm.
 */
1264 void pause_all_vcpus(void)
1268 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, false);
1274 if (qemu_in_vcpu_thread()) {
1276 if (!kvm_enabled()) {
1279 cpu
->stopped
= true;
1285 while (!all_vcpus_paused()) {
1286 qemu_cond_wait(&qemu_pause_cond
, &qemu_global_mutex
);
/*
 * Resume @cpu: the visible statement clears cpu->stopped.
 *
 * NOTE(review): the embedded numbering shows elided lines before and after
 * this assignment, so the function does more than is shown here -- confirm.
 */
1293 void cpu_resume(CPUState
*cpu
)
1296 cpu
->stopped
= false;
/*
 * Resume all vCPUs: the visible statement re-enables QEMU_CLOCK_VIRTUAL,
 * mirroring the disable in pause_all_vcpus().
 *
 * NOTE(review): the per-CPU resume loop is elided in this listing -- confirm.
 */
1300 void resume_all_vcpus(void)
1304 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, true);
1310 /* Size, in bytes, of the temporary buffers used to build vCPU thread names */
1311 #define VCPU_THREAD_NAME_SIZE 16
/*
 * Set up the execution thread for @cpu under TCG.  A single thread and a
 * single halt condition are shared by all CPUs; they are remembered in the
 * function-local statics tcg_cpu_thread and tcg_halt_cond.
 *
 * First call (tcg_cpu_thread still NULL): allocate and initialise
 * cpu->thread and cpu->halt_cond, build the "CPU %d/TCG" thread name, create
 * the joinable thread running qemu_tcg_cpu_thread_fn, then wait on
 * qemu_cpu_cond until cpu->created is set and store the thread in
 * tcg_cpu_thread.  The other visible path reuses the stored thread and
 * condition.
 *
 * NOTE(review): some lines are elided in this listing (the snprintf()
 * argument, the preprocessor guard around the hThread assignment, the else
 * separating the two paths) -- confirm against the full file.
 */
1313 static void qemu_tcg_init_vcpu(CPUState
*cpu
)
1315 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1316 static QemuCond
*tcg_halt_cond
;
1317 static QemuThread
*tcg_cpu_thread
;
1319 /* share a single thread for all cpus with TCG */
1320 if (!tcg_cpu_thread
) {
1321 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1322 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1323 qemu_cond_init(cpu
->halt_cond
);
1324 tcg_halt_cond
= cpu
->halt_cond
;
1325 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/TCG",
1327 qemu_thread_create(cpu
->thread
, thread_name
, qemu_tcg_cpu_thread_fn
,
1328 cpu
, QEMU_THREAD_JOINABLE
);
1330 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
/* the new thread sets cpu->created and signals qemu_cpu_cond */
1332 while (!cpu
->created
) {
1333 qemu_cond_wait(&qemu_cpu_cond
, &qemu_global_mutex
);
1335 tcg_cpu_thread
= cpu
->thread
;
1337 cpu
->thread
= tcg_cpu_thread
;
1338 cpu
->halt_cond
= tcg_halt_cond
;
/*
 * Create the thread for @cpu under KVM: allocate and initialise cpu->thread
 * and cpu->halt_cond, build the "CPU %d/KVM" thread name, start a joinable
 * thread running qemu_kvm_cpu_thread_fn with @cpu as its argument, and wait
 * on qemu_cpu_cond (with qemu_global_mutex) until cpu->created is set.
 *
 * NOTE(review): the final snprintf() argument is elided in this listing.
 */
1342 static void qemu_kvm_start_vcpu(CPUState
*cpu
)
1344 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1346 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1347 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1348 qemu_cond_init(cpu
->halt_cond
);
1349 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/KVM",
1351 qemu_thread_create(cpu
->thread
, thread_name
, qemu_kvm_cpu_thread_fn
,
1352 cpu
, QEMU_THREAD_JOINABLE
);
1353 while (!cpu
->created
) {
1354 qemu_cond_wait(&qemu_cpu_cond
, &qemu_global_mutex
);
/*
 * Create the thread for @cpu when neither KVM nor TCG handles it (see the
 * dispatch in qemu_init_vcpu()): allocate and initialise cpu->thread and
 * cpu->halt_cond, build the "CPU %d/DUMMY" thread name, start a joinable
 * thread running qemu_dummy_cpu_thread_fn with @cpu as its argument, and
 * wait on qemu_cpu_cond (with qemu_global_mutex) until cpu->created is set.
 *
 * NOTE(review): the final snprintf() argument is elided in this listing.
 */
1358 static void qemu_dummy_start_vcpu(CPUState
*cpu
)
1360 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1362 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1363 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1364 qemu_cond_init(cpu
->halt_cond
);
1365 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/DUMMY",
1367 qemu_thread_create(cpu
->thread
, thread_name
, qemu_dummy_cpu_thread_fn
, cpu
,
1368 QEMU_THREAD_JOINABLE
);
1369 while (!cpu
->created
) {
1370 qemu_cond_wait(&qemu_cpu_cond
, &qemu_global_mutex
);
/*
 * Initialise the execution side of @cpu.  Visible steps: copy smp_cores and
 * smp_threads into cpu->nr_cores / cpu->nr_threads, mark the CPU as stopped,
 * create a default address space from cpu->memory and register it as
 * address space 0, then start the CPU with qemu_kvm_start_vcpu() when KVM is
 * enabled, qemu_tcg_init_vcpu() when TCG is enabled, or
 * qemu_dummy_start_vcpu() otherwise.
 *
 * NOTE(review): the condition guarding the default address-space setup, the
 * second argument of address_space_init_shareable() and the final else are
 * elided in this listing -- confirm against the full file.
 */
1374 void qemu_init_vcpu(CPUState
*cpu
)
1376 cpu
->nr_cores
= smp_cores
;
1377 cpu
->nr_threads
= smp_threads
;
1378 cpu
->stopped
= true;
1381 /* If the target cpu hasn't set up any address spaces itself,
1382 * give it the default one.
1384 AddressSpace
*as
= address_space_init_shareable(cpu
->memory
,
1387 cpu_address_space_init(cpu
, as
, 0);
1390 if (kvm_enabled()) {
1391 qemu_kvm_start_vcpu(cpu
);
1392 } else if (tcg_enabled()) {
1393 qemu_tcg_init_vcpu(cpu
);
1395 qemu_dummy_start_vcpu(cpu
);
/*
 * Stop current_cpu: clear its stop request flag, mark it stopped, call
 * cpu_exit() on it and broadcast qemu_pause_cond (the condition that
 * pause_all_vcpus() waits on).
 *
 * NOTE(review): a line before the first assignment is elided in this
 * listing; presumably a NULL check on current_cpu -- confirm.
 */
1399 void cpu_stop_current(void)
1402 current_cpu
->stop
= false;
1403 current_cpu
->stopped
= true;
1404 cpu_exit(current_cpu
);
1405 qemu_cond_broadcast(&qemu_pause_cond
);
/*
 * Stop the VM and move to run state @state.
 *
 * When called from a vCPU thread the stop is not performed inline: it is
 * requested through qemu_system_vmstop_request_prepare() and
 * qemu_system_vmstop_request(state).  The other visible path returns the
 * result of do_vm_stop(state).
 *
 * NOTE(review): the remainder of the vCPU-thread branch (including its
 * return value) and the comment delimiters around the FIXME text are elided
 * in this listing -- confirm against the full file.
 */
1409 int vm_stop(RunState state
)
1411 if (qemu_in_vcpu_thread()) {
1412 qemu_system_vmstop_request_prepare();
1413 qemu_system_vmstop_request(state
);
1415 * FIXME: should not return to device code in case
1416 * vm_stop() has been requested.
1422 return do_vm_stop(state
);
/*
 * Force run state @state.  If the VM is running this delegates to
 * vm_stop(state); the other visible path sets the run state directly with
 * runstate_set() and returns the result of blk_flush_all().
 *
 * NOTE(review): the else separating the two paths is elided in this listing.
 */
1425 /* does a state transition even if the VM is already stopped,
1426 current state is forgotten forever */
1427 int vm_stop_force_state(RunState state
)
1429 if (runstate_is_running()) {
1430 return vm_stop(state
);
1432 runstate_set(state
);
1435 /* Make sure to return an error if the flush in a previous vm_stop()
1437 return blk_flush_all();
/*
 * Compute the instruction budget used by tcg_cpu_exec().
 *
 * When replay_mode is not REPLAY_MODE_PLAY, the budget is derived from the
 * QEMU_CLOCK_VIRTUAL deadline: a negative deadline or one above INT32_MAX is
 * replaced by INT32_MAX, and the result is passed through
 * qemu_icount_round().  The other visible return is
 * replay_get_instructions().
 *
 * NOTE(review): the declaration of deadline and the else before the replay
 * return are elided in this listing -- confirm against the full file.
 */
1441 static int64_t tcg_get_icount_limit(void)
1445 if (replay_mode
!= REPLAY_MODE_PLAY
) {
1446 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
1448 /* Maintain prior (possibly buggy) behaviour where if no deadline
1449 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1450 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1453 if ((deadline
< 0) || (deadline
> INT32_MAX
)) {
1454 deadline
= INT32_MAX
;
1457 return qemu_icount_round(deadline
);
1459 return replay_get_instructions();
/*
 * Run @cpu through cpu_exec() once, with instruction-count bookkeeping
 * around the call.
 *
 * Before execution: any budget still held in cpu->icount_decr.u16.low and
 * cpu->icount_extra is subtracted from timers_state.qemu_icount and both
 * fields are zeroed; a new budget is obtained from tcg_get_icount_limit(),
 * added to qemu_icount and split between icount_decr.u16.low (at most
 * 0xffff) and icount_extra.  After execution the unused part is folded back
 * and replay_account_executed_instructions() is called.  Under
 * CONFIG_PROFILER the time spent in cpu_exec() is added to tcg_time.
 *
 * NOTE(review): local declarations, the conditions guarding the icount
 * sections, the adjustment of count between lines 1482 and 1485, and the
 * return statement are elided in this listing -- confirm.
 */
1463 static int tcg_cpu_exec(CPUState
*cpu
)
1466 #ifdef CONFIG_PROFILER
1470 #ifdef CONFIG_PROFILER
1471 ti
= profile_getclock();
1476 timers_state
.qemu_icount
-= (cpu
->icount_decr
.u16
.low
1477 + cpu
->icount_extra
);
1478 cpu
->icount_decr
.u16
.low
= 0;
1479 cpu
->icount_extra
= 0;
1480 count
= tcg_get_icount_limit();
1481 timers_state
.qemu_icount
+= count
;
/* decr is capped at 0xffff; u16.low is presumably a 16-bit field -- confirm */
1482 decr
= (count
> 0xffff) ? 0xffff : count
;
1484 cpu
->icount_decr
.u16
.low
= decr
;
1485 cpu
->icount_extra
= count
;
1487 ret
= cpu_exec(cpu
);
1488 #ifdef CONFIG_PROFILER
1489 tcg_time
+= profile_getclock() - ti
;
1492 /* Fold pending instructions back into the
1493 instruction counter, and clear the interrupt flag. */
1494 timers_state
.qemu_icount
-= (cpu
->icount_decr
.u16
.low
1495 + cpu
->icount_extra
);
1496 cpu
->icount_decr
.u32
= 0;
1497 cpu
->icount_extra
= 0;
1498 replay_account_executed_instructions();
/*
 * One scheduling pass over the CPUs under TCG.
 *
 * After accounting the warp timer, iteration resumes from next_cpu (or
 * first_cpu when next_cpu is NULL) and advances with CPU_NEXT() until the
 * list ends or exit_request becomes non-zero.  For each CPU,
 * QEMU_CLOCK_VIRTUAL is enabled unless SSTEP_NOTIMER is set in
 * singlestep_enabled; a runnable CPU is executed with tcg_cpu_exec() and an
 * EXCP_DEBUG result is handed to cpu_handle_guest_debug().  exit_request is
 * reset to 0 at the end.
 *
 * NOTE(review): the declaration of r and the bodies following the
 * EXCP_DEBUG and stop/stopped tests are elided in this listing -- confirm.
 * NOTE(review): the "Pairs with smp_wmb in qemu_cpu_kick" comment looks
 * stale; the visible qemu_cpu_kick()/qemu_cpu_kick_no_halt() use
 * atomic_mb_set(), not smp_wmb -- confirm.
 */
1503 static void tcg_exec_all(void)
1507 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1508 qemu_account_warp_timer();
1510 if (next_cpu
== NULL
) {
1511 next_cpu
= first_cpu
;
1513 for (; next_cpu
!= NULL
&& !exit_request
; next_cpu
= CPU_NEXT(next_cpu
)) {
1514 CPUState
*cpu
= next_cpu
;
1516 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
,
1517 (cpu
->singlestep_enabled
& SSTEP_NOTIMER
) == 0);
1519 if (cpu_can_run(cpu
)) {
1520 r
= tcg_cpu_exec(cpu
);
1521 if (r
== EXCP_DEBUG
) {
1522 cpu_handle_guest_debug(cpu
);
1525 } else if (cpu
->stop
|| cpu
->stopped
) {
1530 /* Pairs with smp_wmb in qemu_cpu_kick. */
1531 atomic_mb_set(&exit_request
, 0);
/*
 * Print the list of CPU models to @f using @cpu_fprintf, by delegating to
 * the target's cpu_list macro when it is defined.  @optarg is not used by
 * the visible code.
 *
 * NOTE(review): the closing #endif is elided in this listing.
 */
1534 void list_cpus(FILE *f
, fprintf_function cpu_fprintf
, const char *optarg
)
1536 /* XXX: implement xxx_cpu_list for targets that still miss it */
1537 #if defined(cpu_list)
1538 cpu_list(f
, cpu_fprintf
);
/*
 * QMP handler that builds a CpuInfoList describing the CPUs.
 *
 * For a CPU, the visible code: obtains the target-specific env pointer
 * (x86, PPC, SPARC, MIPS, TriCore), calls cpu_synchronize_state(), allocates
 * a zeroed list node and value with g_malloc0(), and fills in the CPU index,
 * whether it is first_cpu ("current"), the halted flag, the canonical QOM
 * path and the thread id.  The arch tag and program counter field(s) are set
 * per target, with CPU_INFO_ARCH_OTHER as the fallback.  Nodes are chained
 * through head/cur_item.
 *
 * NOTE(review): the loop over CPUs, the declaration of info, the list-append
 * conditions and the return statement are elided in this listing;
 * presumably head is returned -- confirm.  @errp is not used by the visible
 * code.
 */
1542 CpuInfoList
*qmp_query_cpus(Error
**errp
)
1544 CpuInfoList
*head
= NULL
, *cur_item
= NULL
;
1549 #if defined(TARGET_I386)
1550 X86CPU
*x86_cpu
= X86_CPU(cpu
);
1551 CPUX86State
*env
= &x86_cpu
->env
;
1552 #elif defined(TARGET_PPC)
1553 PowerPCCPU
*ppc_cpu
= POWERPC_CPU(cpu
);
1554 CPUPPCState
*env
= &ppc_cpu
->env
;
1555 #elif defined(TARGET_SPARC)
1556 SPARCCPU
*sparc_cpu
= SPARC_CPU(cpu
);
1557 CPUSPARCState
*env
= &sparc_cpu
->env
;
1558 #elif defined(TARGET_MIPS)
1559 MIPSCPU
*mips_cpu
= MIPS_CPU(cpu
);
1560 CPUMIPSState
*env
= &mips_cpu
->env
;
1561 #elif defined(TARGET_TRICORE)
1562 TriCoreCPU
*tricore_cpu
= TRICORE_CPU(cpu
);
1563 CPUTriCoreState
*env
= &tricore_cpu
->env
;
1566 cpu_synchronize_state(cpu
);
1568 info
= g_malloc0(sizeof(*info
));
1569 info
->value
= g_malloc0(sizeof(*info
->value
));
1570 info
->value
->CPU
= cpu
->cpu_index
;
1571 info
->value
->current
= (cpu
== first_cpu
);
1572 info
->value
->halted
= cpu
->halted
;
1573 info
->value
->qom_path
= object_get_canonical_path(OBJECT(cpu
));
1574 info
->value
->thread_id
= cpu
->thread_id
;
1575 #if defined(TARGET_I386)
1576 info
->value
->arch
= CPU_INFO_ARCH_X86
;
1577 info
->value
->u
.x86
.pc
= env
->eip
+ env
->segs
[R_CS
].base
;
1578 #elif defined(TARGET_PPC)
1579 info
->value
->arch
= CPU_INFO_ARCH_PPC
;
1580 info
->value
->u
.ppc
.nip
= env
->nip
;
1581 #elif defined(TARGET_SPARC)
1582 info
->value
->arch
= CPU_INFO_ARCH_SPARC
;
1583 info
->value
->u
.q_sparc
.pc
= env
->pc
;
1584 info
->value
->u
.q_sparc
.npc
= env
->npc
;
1585 #elif defined(TARGET_MIPS)
1586 info
->value
->arch
= CPU_INFO_ARCH_MIPS
;
1587 info
->value
->u
.q_mips
.PC
= env
->active_tc
.PC
;
1588 #elif defined(TARGET_TRICORE)
1589 info
->value
->arch
= CPU_INFO_ARCH_TRICORE
;
1590 info
->value
->u
.tricore
.PC
= env
->PC
;
1592 info
->value
->arch
= CPU_INFO_ARCH_OTHER
;
1595 /* XXX: waiting for the qapi to support GSList */
1597 head
= cur_item
= info
;
1599 cur_item
->next
= info
;
/*
 * QMP handler: save @size bytes of guest memory starting at @addr, as seen
 * by the CPU selected with @cpu_index, into the file @filename.
 *
 * Visible behaviour: the CPU is looked up with qemu_get_cpu(); the file is
 * opened with mode "wb"; memory is read in chunks of l bytes with
 * cpu_memory_rw_debug() and written with fwrite().  Errors are reported
 * through @errp: an invalid "cpu-index" parameter, a failed fopen(), an
 * invalid address/size (reported with the original addr and size), and a
 * short write (QERR_IO_ERROR).
 *
 * NOTE(review): the handling of @has_cpu, the chunking loop, the local
 * declarations and the cleanup (closing the file) are elided in this
 * listing -- confirm against the full file.
 */
1607 void qmp_memsave(int64_t addr
, int64_t size
, const char *filename
,
1608 bool has_cpu
, int64_t cpu_index
, Error
**errp
)
1614 int64_t orig_addr
= addr
, orig_size
= size
;
1620 cpu
= qemu_get_cpu(cpu_index
);
1622 error_setg(errp
, QERR_INVALID_PARAMETER_VALUE
, "cpu-index",
1627 f
= fopen(filename
, "wb");
1629 error_setg_file_open(errp
, errno
, filename
);
1637 if (cpu_memory_rw_debug(cpu
, addr
, buf
, l
, 0) != 0) {
1638 error_setg(errp
, "Invalid addr 0x%016" PRIx64
"/size %" PRId64
1639 " specified", orig_addr
, orig_size
);
1642 if (fwrite(buf
, 1, l
, f
) != l
) {
1643 error_setg(errp
, QERR_IO_ERROR
);
/*
 * QMP handler: save @size bytes of guest physical memory starting at @addr
 * into the file @filename.
 *
 * Visible behaviour: the file is opened with mode "wb"; memory is read in
 * chunks of l bytes with cpu_physical_memory_read() and written with
 * fwrite().  A failed fopen() and a short write (QERR_IO_ERROR) are
 * reported through errp.
 *
 * NOTE(review): the rest of the parameter list, the chunking loop, the
 * local declarations and the cleanup are elided in this listing -- confirm.
 */
1654 void qmp_pmemsave(int64_t addr
, int64_t size
, const char *filename
,
1661 f
= fopen(filename
, "wb");
1663 error_setg_file_open(errp
, errno
, filename
);
1671 cpu_physical_memory_read(addr
, buf
, l
);
1672 if (fwrite(buf
, 1, l
, f
) != l
) {
1673 error_setg(errp
, QERR_IO_ERROR
);
/*
 * QMP handler: inject an NMI into the guest.
 *
 * Under TARGET_I386 the visible code handles a CPU `cs`: when it has no
 * apic_state the NMI is raised with cpu_interrupt(cs, CPU_INTERRUPT_NMI);
 * apic_deliver_nmi() is the other visible delivery call.  The remaining
 * visible call is nmi_monitor_handle() on the monitor's current CPU index,
 * with @errp passed through.
 *
 * NOTE(review): the loop over CPUs, the else between the two x86 delivery
 * calls and the #else/#endif lines are elided in this listing; presumably
 * nmi_monitor_handle() is the non-x86 path -- confirm.
 */
1684 void qmp_inject_nmi(Error
**errp
)
1686 #if defined(TARGET_I386)
1690 X86CPU
*cpu
= X86_CPU(cs
);
1692 if (!cpu
->apic_state
) {
1693 cpu_interrupt(cs
, CPU_INTERRUPT_NMI
);
1695 apic_deliver_nmi(cpu
->apic_state
);
1699 nmi_monitor_handle(monitor_get_cpu_index(), errp
);
/*
 * Print clock drift information to @f using @cpu_fprintf.
 *
 * The host/guest difference is computed as cpu_get_clock() minus
 * cpu_get_icount(), divided by SCALE_MS, and printed in ms.  When
 * icount_align_option is set, the maximum guest delay (-max_delay) and
 * advance (max_advance) are printed in ms as well; the "NA" lines are the
 * other visible output.
 *
 * NOTE(review): lines between the signature and the first print, the else
 * before the "NA" lines and the closing braces are elided in this listing
 * -- confirm against the full file.
 */
1703 void dump_drift_info(FILE *f
, fprintf_function cpu_fprintf
)
1709 cpu_fprintf(f
, "Host - Guest clock %"PRIi64
" ms\n",
1710 (cpu_get_clock() - cpu_get_icount())/SCALE_MS
);
1711 if (icount_align_option
) {
1712 cpu_fprintf(f
, "Max guest delay %"PRIi64
" ms\n", -max_delay
/SCALE_MS
);
1713 cpu_fprintf(f
, "Max guest advance %"PRIi64
" ms\n", max_advance
/SCALE_MS
);
1715 cpu_fprintf(f
, "Max guest delay NA\n");
1716 cpu_fprintf(f
, "Max guest advance NA\n");