4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
28 #include "monitor/monitor.h"
29 #include "qapi/qmp/qerror.h"
30 #include "qemu/error-report.h"
31 #include "sysemu/sysemu.h"
32 #include "sysemu/block-backend.h"
33 #include "exec/gdbstub.h"
34 #include "sysemu/dma.h"
35 #include "sysemu/kvm.h"
36 #include "qmp-commands.h"
38 #include "qemu/thread.h"
39 #include "sysemu/cpus.h"
40 #include "sysemu/qtest.h"
41 #include "qemu/main-loop.h"
42 #include "qemu/bitmap.h"
43 #include "qemu/seqlock.h"
44 #include "qapi-event.h"
46 #include "sysemu/replay.h"
49 #include "qemu/compatfd.h"
54 #include <sys/prctl.h>
57 #define PR_MCE_KILL 33
60 #ifndef PR_MCE_KILL_SET
61 #define PR_MCE_KILL_SET 1
64 #ifndef PR_MCE_KILL_EARLY
65 #define PR_MCE_KILL_EARLY 1
68 #endif /* CONFIG_LINUX */
70 static CPUState
*next_cpu
;
74 /* vcpu throttling controls */
75 static QEMUTimer
*throttle_timer
;
76 static unsigned int throttle_percentage
;
78 #define CPU_THROTTLE_PCT_MIN 1
79 #define CPU_THROTTLE_PCT_MAX 99
80 #define CPU_THROTTLE_TIMESLICE_NS 10000000
82 bool cpu_is_stopped(CPUState
*cpu
)
84 return cpu
->stopped
|| !runstate_is_running();
/*
 * Decide whether the thread driving @cpu currently has nothing to do.
 * The checks that are visible run in this order:
 *   1. a pending stop request (cpu->stop) or queued work items;
 *   2. cpu_is_stopped(cpu);
 *   3. the CPU not being halted, cpu_has_work(cpu), or KVM handling the
 *      halt state in the kernel (kvm_halt_in_kernel()).
 * NOTE(review): the value returned from each branch is not visible in this
 * extract -- confirm against the complete file before relying on it.
 */
87 static bool cpu_thread_is_idle(CPUState
*cpu
)
89 if (cpu
->stop
|| cpu
->queued_work_first
) {
92 if (cpu_is_stopped(cpu
)) {
95 if (!cpu
->halted
|| cpu_has_work(cpu
) ||
96 kvm_halt_in_kernel()) {
102 static bool all_cpu_threads_idle(void)
107 if (!cpu_thread_is_idle(cpu
)) {
114 /***********************************************************/
115 /* guest cycle counter */
117 /* Protected by TimersState seqlock */
119 static bool icount_sleep
= true;
120 static int64_t vm_clock_warp_start
= -1;
121 /* Conversion factor from emulated instructions to virtual clock ticks. */
122 static int icount_time_shift
;
123 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
124 #define MAX_ICOUNT_SHIFT 10
126 static QEMUTimer
*icount_rt_timer
;
127 static QEMUTimer
*icount_vm_timer
;
128 static QEMUTimer
*icount_warp_timer
;
130 typedef struct TimersState
{
131 /* Protected by BQL. */
132 int64_t cpu_ticks_prev
;
133 int64_t cpu_ticks_offset
;
135 /* cpu_clock_offset can be read out of BQL, so protect it with
138 QemuSeqLock vm_clock_seqlock
;
139 int64_t cpu_clock_offset
;
140 int32_t cpu_ticks_enabled
;
143 /* Compensate for varying guest execution speed. */
144 int64_t qemu_icount_bias
;
145 /* Only written by TCG thread */
149 static TimersState timers_state
;
151 int64_t cpu_get_icount_raw(void)
154 CPUState
*cpu
= current_cpu
;
156 icount
= timers_state
.qemu_icount
;
158 if (!cpu
->can_do_io
) {
159 fprintf(stderr
, "Bad icount read\n");
162 icount
-= (cpu
->icount_decr
.u16
.low
+ cpu
->icount_extra
);
167 /* Return the virtual CPU time, based on the instruction counter. */
168 static int64_t cpu_get_icount_locked(void)
170 int64_t icount
= cpu_get_icount_raw();
171 return timers_state
.qemu_icount_bias
+ cpu_icount_to_ns(icount
);
174 int64_t cpu_get_icount(void)
180 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
181 icount
= cpu_get_icount_locked();
182 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
187 int64_t cpu_icount_to_ns(int64_t icount
)
189 return icount
<< icount_time_shift
;
192 /* return the host CPU cycle counter and handle stop/restart */
193 /* Caller must hold the BQL */
194 int64_t cpu_get_ticks(void)
199 return cpu_get_icount();
202 ticks
= timers_state
.cpu_ticks_offset
;
203 if (timers_state
.cpu_ticks_enabled
) {
204 ticks
+= cpu_get_host_ticks();
207 if (timers_state
.cpu_ticks_prev
> ticks
) {
208 /* Note: non increasing ticks may happen if the host uses
210 timers_state
.cpu_ticks_offset
+= timers_state
.cpu_ticks_prev
- ticks
;
211 ticks
= timers_state
.cpu_ticks_prev
;
214 timers_state
.cpu_ticks_prev
= ticks
;
/*
 * Compute the CPU clock value: start from timers_state.cpu_clock_offset and,
 * only while ticks are enabled, add the current get_clock() reading.
 * The "_locked" callers in this file invoke it from inside a
 * vm_clock_seqlock read or write section.
 * NOTE(review): the declaration of `ticks` and the return statement are not
 * visible in this extract; presumably the computed value is returned.
 */
218 static int64_t cpu_get_clock_locked(void)
222 ticks
= timers_state
.cpu_clock_offset
;
223 if (timers_state
.cpu_ticks_enabled
) {
224 ticks
+= get_clock();
230 /* return the host CPU monotonic timer and handle stop/restart */
231 int64_t cpu_get_clock(void)
237 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
238 ti
= cpu_get_clock_locked();
239 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
244 /* enable cpu_get_ticks()
245 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
247 void cpu_enable_ticks(void)
249 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
250 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
251 if (!timers_state
.cpu_ticks_enabled
) {
252 timers_state
.cpu_ticks_offset
-= cpu_get_host_ticks();
253 timers_state
.cpu_clock_offset
-= get_clock();
254 timers_state
.cpu_ticks_enabled
= 1;
256 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
259 /* disable cpu_get_ticks() : the clock is stopped. You must not call
260 * cpu_get_ticks() after that.
261 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
263 void cpu_disable_ticks(void)
265 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
266 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
267 if (timers_state
.cpu_ticks_enabled
) {
268 timers_state
.cpu_ticks_offset
+= cpu_get_host_ticks();
269 timers_state
.cpu_clock_offset
= cpu_get_clock_locked();
270 timers_state
.cpu_ticks_enabled
= 0;
272 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
275 /* Correlation between real and virtual time is always going to be
276 fairly approximate, so ignore small variation.
277 When the guest is idle real and virtual time will be aligned in
279 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
281 static void icount_adjust(void)
287 /* Protected by TimersState mutex. */
288 static int64_t last_delta
;
290 /* If the VM is not running, then do nothing. */
291 if (!runstate_is_running()) {
295 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
296 cur_time
= cpu_get_clock_locked();
297 cur_icount
= cpu_get_icount_locked();
299 delta
= cur_icount
- cur_time
;
300 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
302 && last_delta
+ ICOUNT_WOBBLE
< delta
* 2
303 && icount_time_shift
> 0) {
304 /* The guest is getting too far ahead. Slow time down. */
308 && last_delta
- ICOUNT_WOBBLE
> delta
* 2
309 && icount_time_shift
< MAX_ICOUNT_SHIFT
) {
310 /* The guest is getting too far behind. Speed time up. */
314 timers_state
.qemu_icount_bias
= cur_icount
315 - (timers_state
.qemu_icount
<< icount_time_shift
);
316 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
/*
 * Callback of icount_rt_timer (installed by configure_icount()).
 * Re-arms the timer 1000 ms ahead on QEMU_CLOCK_VIRTUAL_RT so that it fires
 * periodically.  @opaque is not used by the visible code.
 * NOTE(review): one source line after the timer_mod() call is missing from
 * this extract -- presumably the actual adjustment call; confirm.
 */
319 static void icount_adjust_rt(void *opaque
)
321 timer_mod(icount_rt_timer
,
322 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
/*
 * Callback of icount_vm_timer (installed by configure_icount()).
 * Re-arms the timer NANOSECONDS_PER_SECOND / 10 ahead on QEMU_CLOCK_VIRTUAL
 * so that it fires periodically in virtual time.  @opaque is not used by the
 * visible code.
 * NOTE(review): one source line after the timer_mod() call is missing from
 * this extract -- presumably the actual adjustment call; confirm.
 */
326 static void icount_adjust_vm(void *opaque
)
328 timer_mod(icount_vm_timer
,
329 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
330 NANOSECONDS_PER_SECOND
/ 10);
334 static int64_t qemu_icount_round(int64_t count
)
336 return (count
+ (1 << icount_time_shift
) - 1) >> icount_time_shift
;
339 static void icount_warp_rt(void)
344 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
345 * changes from -1 to another value, so the race here is okay.
348 seq
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
349 warp_start
= vm_clock_warp_start
;
350 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, seq
));
352 if (warp_start
== -1) {
356 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
357 if (runstate_is_running()) {
358 int64_t clock
= REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT
,
359 cpu_get_clock_locked());
362 warp_delta
= clock
- vm_clock_warp_start
;
363 if (use_icount
== 2) {
365 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
366 * far ahead of real time.
368 int64_t cur_icount
= cpu_get_icount_locked();
369 int64_t delta
= clock
- cur_icount
;
370 warp_delta
= MIN(warp_delta
, delta
);
372 timers_state
.qemu_icount_bias
+= warp_delta
;
374 vm_clock_warp_start
= -1;
375 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
377 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL
)) {
378 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
382 static void icount_timer_cb(void *opaque
)
384 /* No need for a checkpoint because the timer already synchronizes
385 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
390 void qtest_clock_warp(int64_t dest
)
392 int64_t clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
393 AioContext
*aio_context
;
394 assert(qtest_enabled());
395 aio_context
= qemu_get_aio_context();
396 while (clock
< dest
) {
397 int64_t deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
398 int64_t warp
= qemu_soonest_timeout(dest
- clock
, deadline
);
400 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
401 timers_state
.qemu_icount_bias
+= warp
;
402 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
404 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL
);
405 timerlist_run_timers(aio_context
->tlg
.tl
[QEMU_CLOCK_VIRTUAL
]);
406 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
408 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
411 void qemu_start_warp_timer(void)
420 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
421 * do not fire, so computing the deadline does not make sense.
423 if (!runstate_is_running()) {
427 /* warp clock deterministically in record/replay mode */
428 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START
)) {
432 if (!all_cpu_threads_idle()) {
436 if (qtest_enabled()) {
437 /* When testing, qtest commands advance icount. */
441 /* We want to use the earliest deadline from ALL vm_clocks */
442 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
);
443 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
445 static bool notified
;
446 if (!icount_sleep
&& !notified
) {
447 error_report("WARNING: icount sleep disabled and no active timers");
455 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
456 * sleep. Otherwise, the CPU might be waiting for a future timer
457 * interrupt to wake it up, but the interrupt never comes because
458 * the vCPU isn't running any insns and thus doesn't advance the
459 * QEMU_CLOCK_VIRTUAL.
463 * We never let VCPUs sleep in no sleep icount mode.
464 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
465 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
466 * It is useful when we want a deterministic execution time,
467 * isolated from host latencies.
469 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
470 timers_state
.qemu_icount_bias
+= deadline
;
471 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
472 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
475 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
476 * "real" time, (related to the time left until the next event) has
477 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
478 * This avoids that the warps are visible externally; for example,
479 * you will not be sending network packets continuously instead of
482 seqlock_write_lock(&timers_state
.vm_clock_seqlock
);
483 if (vm_clock_warp_start
== -1 || vm_clock_warp_start
> clock
) {
484 vm_clock_warp_start
= clock
;
486 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
);
487 timer_mod_anticipate(icount_warp_timer
, clock
+ deadline
);
489 } else if (deadline
== 0) {
490 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
494 static void qemu_account_warp_timer(void)
496 if (!use_icount
|| !icount_sleep
) {
500 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
501 * do not fire, so computing the deadline does not make sense.
503 if (!runstate_is_running()) {
507 /* warp clock deterministically in record/replay mode */
508 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT
)) {
512 timer_del(icount_warp_timer
);
516 static bool icount_state_needed(void *opaque
)
522 * This is a subsection for icount migration.
524 static const VMStateDescription icount_vmstate_timers
= {
525 .name
= "timer/icount",
527 .minimum_version_id
= 1,
528 .needed
= icount_state_needed
,
529 .fields
= (VMStateField
[]) {
530 VMSTATE_INT64(qemu_icount_bias
, TimersState
),
531 VMSTATE_INT64(qemu_icount
, TimersState
),
532 VMSTATE_END_OF_LIST()
536 static const VMStateDescription vmstate_timers
= {
539 .minimum_version_id
= 1,
540 .fields
= (VMStateField
[]) {
541 VMSTATE_INT64(cpu_ticks_offset
, TimersState
),
542 VMSTATE_INT64(dummy
, TimersState
),
543 VMSTATE_INT64_V(cpu_clock_offset
, TimersState
, 2),
544 VMSTATE_END_OF_LIST()
546 .subsections
= (const VMStateDescription
*[]) {
547 &icount_vmstate_timers
,
/*
 * Work item that throttles one vCPU by putting its thread to sleep; it is
 * queued by cpu_throttle_timer_tick() through async_run_on_cpu(), with the
 * CPUState passed as @opaque.
 *
 * With pct = throttle percentage / 100, the sleep time is
 *     pct / (1 - pct) * CPU_THROTTLE_TIMESLICE_NS
 * nanoseconds.  The iothread lock is released for the duration of the sleep
 * and re-taken afterwards; cpu->throttle_thread_scheduled is cleared before
 * sleeping so that the timer can schedule the next round.
 * NOTE(review): the declarations of `pct` / `sleeptime_ns` and the body of
 * the zero-percentage check are missing from this extract -- presumably an
 * early return; confirm.
 */
552 static void cpu_throttle_thread(void *opaque
)
554 CPUState
*cpu
= opaque
;
556 double throttle_ratio
;
559 if (!cpu_throttle_get_percentage()) {
563 pct
= (double)cpu_throttle_get_percentage()/100;
564 throttle_ratio
= pct
/ (1 - pct
);
565 sleeptime_ns
= (long)(throttle_ratio
* CPU_THROTTLE_TIMESLICE_NS
);
567 qemu_mutex_unlock_iothread();
568 atomic_set(&cpu
->throttle_thread_scheduled
, 0);
569 g_usleep(sleeptime_ns
/ 1000); /* Convert ns to us for usleep call */
570 qemu_mutex_lock_iothread();
/*
 * Callback of throttle_timer (created in cpu_ticks_init()).
 *
 * For a vCPU whose throttle_thread_scheduled flag was not already set (the
 * flag is set atomically with atomic_xchg), cpu_throttle_thread is queued on
 * that vCPU.  The timer is then re-armed
 * CPU_THROTTLE_TIMESLICE_NS / (1 - pct) nanoseconds ahead on
 * QEMU_CLOCK_VIRTUAL_RT, where pct = throttle percentage / 100.
 * NOTE(review): the declaration of `cpu`/`pct`, the iteration over vCPUs and
 * the body of the zero-percentage check are missing from this extract --
 * confirm against the complete file.
 */
573 static void cpu_throttle_timer_tick(void *opaque
)
578 /* Stop the timer if needed */
579 if (!cpu_throttle_get_percentage()) {
583 if (!atomic_xchg(&cpu
->throttle_thread_scheduled
, 1)) {
584 async_run_on_cpu(cpu
, cpu_throttle_thread
, cpu
);
588 pct
= (double)cpu_throttle_get_percentage()/100;
589 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
590 CPU_THROTTLE_TIMESLICE_NS
/ (1-pct
));
593 void cpu_throttle_set(int new_throttle_pct
)
595 /* Ensure throttle percentage is within valid range */
596 new_throttle_pct
= MIN(new_throttle_pct
, CPU_THROTTLE_PCT_MAX
);
597 new_throttle_pct
= MAX(new_throttle_pct
, CPU_THROTTLE_PCT_MIN
);
599 atomic_set(&throttle_percentage
, new_throttle_pct
);
601 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
602 CPU_THROTTLE_TIMESLICE_NS
);
605 void cpu_throttle_stop(void)
607 atomic_set(&throttle_percentage
, 0);
/* Return true while a non-zero throttle percentage is in effect. */
bool cpu_throttle_active(void)
{
    int pct = cpu_throttle_get_percentage();

    return pct != 0;
}
615 int cpu_throttle_get_percentage(void)
617 return atomic_read(&throttle_percentage
);
620 void cpu_ticks_init(void)
622 seqlock_init(&timers_state
.vm_clock_seqlock
, NULL
);
623 vmstate_register(NULL
, 0, &vmstate_timers
, &timers_state
);
624 throttle_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
625 cpu_throttle_timer_tick
, NULL
);
628 void configure_icount(QemuOpts
*opts
, Error
**errp
)
631 char *rem_str
= NULL
;
633 option
= qemu_opt_get(opts
, "shift");
635 if (qemu_opt_get(opts
, "align") != NULL
) {
636 error_setg(errp
, "Please specify shift option when using align");
641 icount_sleep
= qemu_opt_get_bool(opts
, "sleep", true);
643 icount_warp_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
644 icount_timer_cb
, NULL
);
647 icount_align_option
= qemu_opt_get_bool(opts
, "align", false);
649 if (icount_align_option
&& !icount_sleep
) {
650 error_setg(errp
, "align=on and sleep=off are incompatible");
652 if (strcmp(option
, "auto") != 0) {
654 icount_time_shift
= strtol(option
, &rem_str
, 0);
655 if (errno
!= 0 || *rem_str
!= '\0' || !strlen(option
)) {
656 error_setg(errp
, "icount: Invalid shift value");
660 } else if (icount_align_option
) {
661 error_setg(errp
, "shift=auto and align=on are incompatible");
662 } else if (!icount_sleep
) {
663 error_setg(errp
, "shift=auto and sleep=off are incompatible");
668 /* 125MIPS seems a reasonable initial guess at the guest speed.
669 It will be corrected fairly quickly anyway. */
670 icount_time_shift
= 3;
672 /* Have both realtime and virtual time triggers for speed adjustment.
673 The realtime trigger catches emulated time passing too slowly,
674 the virtual time trigger catches emulated time passing too fast.
675 Realtime triggers occur even when idle, so use them less frequently
677 icount_rt_timer
= timer_new_ms(QEMU_CLOCK_VIRTUAL_RT
,
678 icount_adjust_rt
, NULL
);
679 timer_mod(icount_rt_timer
,
680 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
681 icount_vm_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL
,
682 icount_adjust_vm
, NULL
);
683 timer_mod(icount_vm_timer
,
684 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
685 NANOSECONDS_PER_SECOND
/ 10);
688 /***********************************************************/
689 void hw_error(const char *fmt
, ...)
695 fprintf(stderr
, "qemu: hardware error: ");
696 vfprintf(stderr
, fmt
, ap
);
697 fprintf(stderr
, "\n");
699 fprintf(stderr
, "CPU #%d:\n", cpu
->cpu_index
);
700 cpu_dump_state(cpu
, stderr
, fprintf
, CPU_DUMP_FPU
);
706 void cpu_synchronize_all_states(void)
711 cpu_synchronize_state(cpu
);
715 void cpu_synchronize_all_post_reset(void)
720 cpu_synchronize_post_reset(cpu
);
724 void cpu_synchronize_all_post_init(void)
729 cpu_synchronize_post_init(cpu
);
733 static int do_vm_stop(RunState state
)
737 if (runstate_is_running()) {
741 vm_state_notify(0, state
);
742 qapi_event_send_stop(&error_abort
);
746 ret
= blk_flush_all();
751 static bool cpu_can_run(CPUState
*cpu
)
756 if (cpu_is_stopped(cpu
)) {
/*
 * React to a guest debug exit on @cpu (the KVM vCPU loop calls this when
 * kvm_cpu_exec() returns EXCP_DEBUG): record @cpu as the CPU that stopped
 * for the gdbstub, then raise a system-wide debug request.
 * NOTE(review): the line(s) following qemu_system_debug_request() are
 * missing from this extract -- confirm against the complete file.
 */
762 static void cpu_handle_guest_debug(CPUState
*cpu
)
764 gdb_set_stop_cpu(cpu
);
765 qemu_system_debug_request();
770 static void sigbus_reraise(void)
773 struct sigaction action
;
775 memset(&action
, 0, sizeof(action
));
776 action
.sa_handler
= SIG_DFL
;
777 if (!sigaction(SIGBUS
, &action
, NULL
)) {
780 sigaddset(&set
, SIGBUS
);
781 sigprocmask(SIG_UNBLOCK
, &set
, NULL
);
783 perror("Failed to re-raise SIGBUS!\n");
787 static void sigbus_handler(int n
, struct qemu_signalfd_siginfo
*siginfo
,
790 if (kvm_on_sigbus(siginfo
->ssi_code
,
791 (void *)(intptr_t)siginfo
->ssi_addr
)) {
796 static void qemu_init_sigbus(void)
798 struct sigaction action
;
800 memset(&action
, 0, sizeof(action
));
801 action
.sa_flags
= SA_SIGINFO
;
802 action
.sa_sigaction
= (void (*)(int, siginfo_t
*, void*))sigbus_handler
;
803 sigaction(SIGBUS
, &action
, NULL
);
805 prctl(PR_MCE_KILL
, PR_MCE_KILL_SET
, PR_MCE_KILL_EARLY
, 0, 0);
808 static void qemu_kvm_eat_signals(CPUState
*cpu
)
810 struct timespec ts
= { 0, 0 };
816 sigemptyset(&waitset
);
817 sigaddset(&waitset
, SIG_IPI
);
818 sigaddset(&waitset
, SIGBUS
);
821 r
= sigtimedwait(&waitset
, &siginfo
, &ts
);
822 if (r
== -1 && !(errno
== EAGAIN
|| errno
== EINTR
)) {
823 perror("sigtimedwait");
829 if (kvm_on_sigbus_vcpu(cpu
, siginfo
.si_code
, siginfo
.si_addr
)) {
837 r
= sigpending(&chkset
);
839 perror("sigpending");
842 } while (sigismember(&chkset
, SIG_IPI
) || sigismember(&chkset
, SIGBUS
));
845 #else /* !CONFIG_LINUX */
847 static void qemu_init_sigbus(void)
851 static void qemu_kvm_eat_signals(CPUState
*cpu
)
854 #endif /* !CONFIG_LINUX */
857 static void dummy_signal(int sig
)
861 static void qemu_kvm_init_cpu_signals(CPUState
*cpu
)
865 struct sigaction sigact
;
867 memset(&sigact
, 0, sizeof(sigact
));
868 sigact
.sa_handler
= dummy_signal
;
869 sigaction(SIG_IPI
, &sigact
, NULL
);
871 pthread_sigmask(SIG_BLOCK
, NULL
, &set
);
872 sigdelset(&set
, SIG_IPI
);
873 sigdelset(&set
, SIGBUS
);
874 r
= kvm_set_signal_mask(cpu
, &set
);
876 fprintf(stderr
, "kvm_set_signal_mask: %s\n", strerror(-r
));
882 static void qemu_kvm_init_cpu_signals(CPUState
*cpu
)
888 static QemuMutex qemu_global_mutex
;
889 static QemuCond qemu_io_proceeded_cond
;
890 static unsigned iothread_requesting_mutex
;
892 static QemuThread io_thread
;
895 static QemuCond qemu_cpu_cond
;
897 static QemuCond qemu_pause_cond
;
898 static QemuCond qemu_work_cond
;
/*
 * Initialise the synchronisation primitives used by the vCPU threads: the
 * four condition variables (qemu_cpu_cond, qemu_pause_cond, qemu_work_cond,
 * qemu_io_proceeded_cond) and qemu_global_mutex.  The calling thread is
 * recorded in io_thread.
 * NOTE(review): one source line before the first qemu_cond_init() call is
 * missing from this extract -- confirm against the complete file.
 */
900 void qemu_init_cpu_loop(void)
903 qemu_cond_init(&qemu_cpu_cond
);
904 qemu_cond_init(&qemu_pause_cond
);
905 qemu_cond_init(&qemu_work_cond
);
906 qemu_cond_init(&qemu_io_proceeded_cond
);
907 qemu_mutex_init(&qemu_global_mutex
);
909 qemu_thread_get_self(&io_thread
);
912 void run_on_cpu(CPUState
*cpu
, void (*func
)(void *data
), void *data
)
914 struct qemu_work_item wi
;
916 if (qemu_cpu_is_self(cpu
)) {
925 qemu_mutex_lock(&cpu
->work_mutex
);
926 if (cpu
->queued_work_first
== NULL
) {
927 cpu
->queued_work_first
= &wi
;
929 cpu
->queued_work_last
->next
= &wi
;
931 cpu
->queued_work_last
= &wi
;
934 qemu_mutex_unlock(&cpu
->work_mutex
);
937 while (!atomic_mb_read(&wi
.done
)) {
938 CPUState
*self_cpu
= current_cpu
;
940 qemu_cond_wait(&qemu_work_cond
, &qemu_global_mutex
);
941 current_cpu
= self_cpu
;
945 void async_run_on_cpu(CPUState
*cpu
, void (*func
)(void *data
), void *data
)
947 struct qemu_work_item
*wi
;
949 if (qemu_cpu_is_self(cpu
)) {
954 wi
= g_malloc0(sizeof(struct qemu_work_item
));
959 qemu_mutex_lock(&cpu
->work_mutex
);
960 if (cpu
->queued_work_first
== NULL
) {
961 cpu
->queued_work_first
= wi
;
963 cpu
->queued_work_last
->next
= wi
;
965 cpu
->queued_work_last
= wi
;
968 qemu_mutex_unlock(&cpu
->work_mutex
);
973 static void flush_queued_work(CPUState
*cpu
)
975 struct qemu_work_item
*wi
;
977 if (cpu
->queued_work_first
== NULL
) {
981 qemu_mutex_lock(&cpu
->work_mutex
);
982 while (cpu
->queued_work_first
!= NULL
) {
983 wi
= cpu
->queued_work_first
;
984 cpu
->queued_work_first
= wi
->next
;
985 if (!cpu
->queued_work_first
) {
986 cpu
->queued_work_last
= NULL
;
988 qemu_mutex_unlock(&cpu
->work_mutex
);
990 qemu_mutex_lock(&cpu
->work_mutex
);
994 atomic_mb_set(&wi
->done
, true);
997 qemu_mutex_unlock(&cpu
->work_mutex
);
998 qemu_cond_broadcast(&qemu_work_cond
);
/*
 * Common tail of the per-accelerator "wait for I/O event" helpers.
 * Visible steps: mark @cpu as stopped and wake waiters on qemu_pause_cond,
 * run the work items queued for @cpu, and clear cpu->thread_kicked so that
 * the thread can be kicked again.
 * NOTE(review): the lines in front of `cpu->stopped = true` are missing from
 * this extract; the stop handling is presumably conditional on a pending
 * stop request -- confirm against the complete file.
 */
1001 static void qemu_wait_io_event_common(CPUState
*cpu
)
1005 cpu
->stopped
= true;
1006 qemu_cond_broadcast(&qemu_pause_cond
);
1008 flush_queued_work(cpu
);
1009 cpu
->thread_kicked
= false;
/*
 * TCG variant of the "wait for I/O event" step.
 * Blocks on cpu->halt_cond (with qemu_global_mutex) for as long as all vCPU
 * threads are idle, then blocks on qemu_io_proceeded_cond for as long as
 * another thread is requesting the global mutex
 * (iothread_requesting_mutex != 0), and finally runs the common event
 * handling.
 * NOTE(review): a line in front of the qemu_wait_io_event_common() call is
 * missing from this extract, so it may be executed inside a loop over the
 * vCPUs rather than once -- confirm against the complete file.
 */
1012 static void qemu_tcg_wait_io_event(CPUState
*cpu
)
1014 while (all_cpu_threads_idle()) {
1015 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1018 while (iothread_requesting_mutex
) {
1019 qemu_cond_wait(&qemu_io_proceeded_cond
, &qemu_global_mutex
);
1023 qemu_wait_io_event_common(cpu
);
1027 static void qemu_kvm_wait_io_event(CPUState
*cpu
)
1029 while (cpu_thread_is_idle(cpu
)) {
1030 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1033 qemu_kvm_eat_signals(cpu
);
1034 qemu_wait_io_event_common(cpu
);
1037 static void *qemu_kvm_cpu_thread_fn(void *arg
)
1039 CPUState
*cpu
= arg
;
1042 rcu_register_thread();
1044 qemu_mutex_lock_iothread();
1045 qemu_thread_get_self(cpu
->thread
);
1046 cpu
->thread_id
= qemu_get_thread_id();
1050 r
= kvm_init_vcpu(cpu
);
1052 fprintf(stderr
, "kvm_init_vcpu failed: %s\n", strerror(-r
));
1056 qemu_kvm_init_cpu_signals(cpu
);
1058 /* signal CPU creation */
1059 cpu
->created
= true;
1060 qemu_cond_signal(&qemu_cpu_cond
);
1063 if (cpu_can_run(cpu
)) {
1064 r
= kvm_cpu_exec(cpu
);
1065 if (r
== EXCP_DEBUG
) {
1066 cpu_handle_guest_debug(cpu
);
1069 qemu_kvm_wait_io_event(cpu
);
1075 static void *qemu_dummy_cpu_thread_fn(void *arg
)
1078 fprintf(stderr
, "qtest is not supported under Windows\n");
1081 CPUState
*cpu
= arg
;
1085 rcu_register_thread();
1087 qemu_mutex_lock_iothread();
1088 qemu_thread_get_self(cpu
->thread
);
1089 cpu
->thread_id
= qemu_get_thread_id();
1092 sigemptyset(&waitset
);
1093 sigaddset(&waitset
, SIG_IPI
);
1095 /* signal CPU creation */
1096 cpu
->created
= true;
1097 qemu_cond_signal(&qemu_cpu_cond
);
1102 qemu_mutex_unlock_iothread();
1105 r
= sigwait(&waitset
, &sig
);
1106 } while (r
== -1 && (errno
== EAGAIN
|| errno
== EINTR
));
1111 qemu_mutex_lock_iothread();
1113 qemu_wait_io_event_common(cpu
);
1120 static void tcg_exec_all(void);
1122 static void *qemu_tcg_cpu_thread_fn(void *arg
)
1124 CPUState
*cpu
= arg
;
1126 rcu_register_thread();
1128 qemu_mutex_lock_iothread();
1129 qemu_thread_get_self(cpu
->thread
);
1132 cpu
->thread_id
= qemu_get_thread_id();
1133 cpu
->created
= true;
1136 qemu_cond_signal(&qemu_cpu_cond
);
1138 /* wait for initial kick-off after machine start */
1139 while (first_cpu
->stopped
) {
1140 qemu_cond_wait(first_cpu
->halt_cond
, &qemu_global_mutex
);
1142 /* process any pending work */
1144 qemu_wait_io_event_common(cpu
);
1148 /* process any pending work */
1149 atomic_mb_set(&exit_request
, 1);
1155 int64_t deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
1157 if (deadline
== 0) {
1158 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
1161 qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus
));
1167 static void qemu_cpu_kick_thread(CPUState
*cpu
)
1172 if (cpu
->thread_kicked
) {
1175 cpu
->thread_kicked
= true;
1176 err
= pthread_kill(cpu
->thread
->thread
, SIG_IPI
);
1178 fprintf(stderr
, "qemu:%s: %s", __func__
, strerror(err
));
1186 static void qemu_cpu_kick_no_halt(void)
1189 /* Ensure whatever caused the exit has reached the CPU threads before
1190 * writing exit_request.
1192 atomic_mb_set(&exit_request
, 1);
1193 cpu
= atomic_mb_read(&tcg_current_cpu
);
/*
 * Wake up / interrupt the thread running @cpu.
 * Waiters on cpu->halt_cond are always woken first.  With TCG enabled,
 * qemu_cpu_kick_no_halt() is used to make the running vCPU leave the
 * execution loop; qemu_cpu_kick_thread() sends the kick to @cpu's thread.
 * NOTE(review): the line between the two kick calls is missing from this
 * extract, so it is not visible whether qemu_cpu_kick_thread() is the
 * non-TCG branch or runs unconditionally -- confirm against the full file.
 */
1199 void qemu_cpu_kick(CPUState
*cpu
)
1201 qemu_cond_broadcast(cpu
->halt_cond
);
1202 if (tcg_enabled()) {
1203 qemu_cpu_kick_no_halt();
1205 qemu_cpu_kick_thread(cpu
);
1209 void qemu_cpu_kick_self(void)
1211 assert(current_cpu
);
1212 qemu_cpu_kick_thread(current_cpu
);
1215 bool qemu_cpu_is_self(CPUState
*cpu
)
1217 return qemu_thread_is_self(cpu
->thread
);
1220 bool qemu_in_vcpu_thread(void)
1222 return current_cpu
&& qemu_cpu_is_self(current_cpu
);
/*
 * Per-thread flag: set by qemu_mutex_lock_iothread() and cleared by
 * qemu_mutex_unlock_iothread().  It has static storage duration, so it
 * starts out false without an explicit initializer.
 */
static __thread bool iothread_locked;

/* Tell whether the calling thread currently holds the iothread lock. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1232 void qemu_mutex_lock_iothread(void)
1234 atomic_inc(&iothread_requesting_mutex
);
1235 /* In the simple case there is no need to bump the VCPU thread out of
1236 * TCG code execution.
1238 if (!tcg_enabled() || qemu_in_vcpu_thread() ||
1239 !first_cpu
|| !first_cpu
->created
) {
1240 qemu_mutex_lock(&qemu_global_mutex
);
1241 atomic_dec(&iothread_requesting_mutex
);
1243 if (qemu_mutex_trylock(&qemu_global_mutex
)) {
1244 qemu_cpu_kick_no_halt();
1245 qemu_mutex_lock(&qemu_global_mutex
);
1247 atomic_dec(&iothread_requesting_mutex
);
1248 qemu_cond_broadcast(&qemu_io_proceeded_cond
);
1250 iothread_locked
= true;
1253 void qemu_mutex_unlock_iothread(void)
1255 iothread_locked
= false;
1256 qemu_mutex_unlock(&qemu_global_mutex
);
/*
 * Check the ->stopped flag of vCPUs; pause_all_vcpus() loops until this
 * function returns non-zero.
 *
 * NOTE(review): only the "!cpu->stopped" test is visible in this listing;
 * the iteration over CPUs and the return statements are missing.
 * Presumably 0 is returned as soon as one CPU is not stopped and 1
 * otherwise; confirm against the complete source.
 */
1259 static int all_vcpus_paused(void)
1264 if (!cpu
->stopped
) {
/*
 * Pause the vCPUs.
 *
 * Visible steps: QEMU_CLOCK_VIRTUAL is disabled, a special path exists for
 * callers that are themselves a vCPU thread (with a !kvm_enabled() case
 * that marks CPUs as stopped directly), and the function then waits on
 * qemu_pause_cond, with qemu_global_mutex, until all_vcpus_paused()
 * reports true.
 *
 * NOTE(review): the loops over CPUs and several statements are missing
 * from this listing; confirm the exact control flow against the complete
 * source.
 */
1272 void pause_all_vcpus(void)
1276 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, false);
1282 if (qemu_in_vcpu_thread()) {
1284 if (!kvm_enabled()) {
1287 cpu
->stopped
= true;
1293 while (!all_vcpus_paused()) {
1294 qemu_cond_wait(&qemu_pause_cond
, &qemu_global_mutex
);
/*
 * Resume @cpu: clears its ->stopped flag.
 *
 * NOTE(review): other statements of this function appear to be missing
 * from this listing; confirm against the complete source.
 */
1301 void cpu_resume(CPUState
*cpu
)
1304 cpu
->stopped
= false;
/*
 * Resume the vCPUs; the visible part re-enables QEMU_CLOCK_VIRTUAL.
 *
 * NOTE(review): the per-CPU part of this function is not visible in this
 * listing (presumably each CPU is resumed; confirm against the complete
 * source).
 */
1308 void resume_all_vcpus(void)
1312 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, true);
1318 /* Size of the temporary buffers used to build vCPU thread names */
1319 #define VCPU_THREAD_NAME_SIZE 16
/*
 * Set up the thread and halt condition of @cpu for TCG.
 *
 * All CPUs share a single TCG thread.  The function-static tcg_cpu_thread
 * and tcg_halt_cond remember the thread and condition created on the first
 * call: that call allocates both, initialises the condition, creates a
 * joinable thread named "CPU %d/TCG" running qemu_tcg_cpu_thread_fn with
 * @cpu as argument, and waits on qemu_cpu_cond until cpu->created is set.
 * The remaining assignments make @cpu point at the shared thread and
 * condition.
 *
 * NOTE(review): lines are missing from this listing (the last snprintf
 * argument, preprocessor guards around the hThread assignment and the
 * else branch); confirm against the complete source.
 */
1321 static void qemu_tcg_init_vcpu(CPUState
*cpu
)
1323 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1324 static QemuCond
*tcg_halt_cond
;
1325 static QemuThread
*tcg_cpu_thread
;
1327 /* share a single thread for all cpus with TCG */
1328 if (!tcg_cpu_thread
) {
1329 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1330 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1331 qemu_cond_init(cpu
->halt_cond
);
1332 tcg_halt_cond
= cpu
->halt_cond
;
1333 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/TCG",
1335 qemu_thread_create(cpu
->thread
, thread_name
, qemu_tcg_cpu_thread_fn
,
1336 cpu
, QEMU_THREAD_JOINABLE
);
1338 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
1340 while (!cpu
->created
) {
1341 qemu_cond_wait(&qemu_cpu_cond
, &qemu_global_mutex
);
1343 tcg_cpu_thread
= cpu
->thread
;
1345 cpu
->thread
= tcg_cpu_thread
;
1346 cpu
->halt_cond
= tcg_halt_cond
;
/*
 * Create the dedicated KVM thread for @cpu.
 *
 * Allocates the QemuThread and QemuCond of the CPU, initialises the halt
 * condition, creates a joinable thread named "CPU %d/KVM" running
 * qemu_kvm_cpu_thread_fn with @cpu as argument, and waits on qemu_cpu_cond
 * (with qemu_global_mutex) until the new thread has set cpu->created.
 *
 * NOTE(review): the last snprintf argument and the closing braces are
 * missing from this listing.
 */
1350 static void qemu_kvm_start_vcpu(CPUState
*cpu
)
1352 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1354 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1355 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1356 qemu_cond_init(cpu
->halt_cond
);
1357 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/KVM",
1359 qemu_thread_create(cpu
->thread
, thread_name
, qemu_kvm_cpu_thread_fn
,
1360 cpu
, QEMU_THREAD_JOINABLE
);
1361 while (!cpu
->created
) {
1362 qemu_cond_wait(&qemu_cpu_cond
, &qemu_global_mutex
);
/*
 * Create the dummy vCPU thread for @cpu (used by qemu_init_vcpu when
 * neither KVM nor TCG is enabled).
 *
 * Allocates the QemuThread and QemuCond of the CPU, initialises the halt
 * condition, creates a joinable thread named "CPU %d/DUMMY" running
 * qemu_dummy_cpu_thread_fn with @cpu as argument, and waits on
 * qemu_cpu_cond (with qemu_global_mutex) until cpu->created is set.
 *
 * NOTE(review): the last snprintf argument and the closing braces are
 * missing from this listing.
 */
1366 static void qemu_dummy_start_vcpu(CPUState
*cpu
)
1368 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1370 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1371 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1372 qemu_cond_init(cpu
->halt_cond
);
1373 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/DUMMY",
1375 qemu_thread_create(cpu
->thread
, thread_name
, qemu_dummy_cpu_thread_fn
, cpu
,
1376 QEMU_THREAD_JOINABLE
);
1377 while (!cpu
->created
) {
1378 qemu_cond_wait(&qemu_cpu_cond
, &qemu_global_mutex
);
/*
 * Initialise the execution side of @cpu and start its thread.
 *
 * Copies smp_cores and smp_threads into the CPU, marks it stopped, gives
 * it a default address space built from cpu->memory (installed at index 0)
 * and then dispatches on the accelerator: qemu_kvm_start_vcpu() for KVM,
 * qemu_tcg_init_vcpu() for TCG, qemu_dummy_start_vcpu() otherwise.
 *
 * NOTE(review): the condition guarding the default address space and the
 * remaining address_space_init_shareable() argument are missing from this
 * listing; confirm against the complete source.
 */
1382 void qemu_init_vcpu(CPUState
*cpu
)
1384 cpu
->nr_cores
= smp_cores
;
1385 cpu
->nr_threads
= smp_threads
;
1386 cpu
->stopped
= true;
1389 /* If the target cpu hasn't set up any address spaces itself,
1390 * give it the default one.
1392 AddressSpace
*as
= address_space_init_shareable(cpu
->memory
,
1395 cpu_address_space_init(cpu
, as
, 0);
1398 if (kvm_enabled()) {
1399 qemu_kvm_start_vcpu(cpu
);
1400 } else if (tcg_enabled()) {
1401 qemu_tcg_init_vcpu(cpu
);
1403 qemu_dummy_start_vcpu(cpu
);
/*
 * Stop the vCPU the caller is running on: clears current_cpu->stop, sets
 * current_cpu->stopped, calls cpu_exit() on it and wakes every waiter on
 * qemu_pause_cond.
 *
 * NOTE(review): lines are missing from this listing (presumably a check
 * that current_cpu is set; confirm against the complete source).
 */
1407 void cpu_stop_current(void)
1410 current_cpu
->stop
= false;
1411 current_cpu
->stopped
= true;
1412 cpu_exit(current_cpu
);
1413 qemu_cond_broadcast(&qemu_pause_cond
);
/*
 * Stop the VM, entering run state @state.
 *
 * When called from a vCPU thread the stop is only requested, through
 * qemu_system_vmstop_request_prepare() and qemu_system_vmstop_request().
 * Otherwise the result of do_vm_stop(@state) is returned.
 *
 * NOTE(review): the end of the vCPU-thread branch (including its return
 * value) is missing from this listing.
 */
1417 int vm_stop(RunState state
)
1419 if (qemu_in_vcpu_thread()) {
1420 qemu_system_vmstop_request_prepare();
1421 qemu_system_vmstop_request(state
);
1423 * FIXME: should not return to device code in case
1424 * vm_stop() has been requested.
1430 return do_vm_stop(state
);
/*
 * Force the run state to @state.
 *
 * If the VM is running this is vm_stop(@state).  Otherwise the run state
 * is set directly with runstate_set() and the result of blk_flush_all() is
 * returned, so that a flush failure is still reported.
 *
 * NOTE(review): lines are missing from this listing between runstate_set()
 * and the final return; confirm against the complete source.
 */
1433 /* does a state transition even if the VM is already stopped,
1434 current state is forgotten forever */
1435 int vm_stop_force_state(RunState state
)
1437 if (runstate_is_running()) {
1438 return vm_stop(state
);
1440 runstate_set(state
);
1443 /* Make sure to return an error if the flush in a previous vm_stop()
1445 return blk_flush_all();
/*
 * Compute the instruction budget handed to tcg_cpu_exec().
 *
 * Outside of replay playback the budget is derived from the
 * QEMU_CLOCK_VIRTUAL deadline: a negative deadline or one above INT32_MAX
 * is replaced by INT32_MAX, and the value is converted with
 * qemu_icount_round().  The other return path yields
 * replay_get_instructions().
 *
 * NOTE(review): the declaration of deadline and the else line are missing
 * from this listing; presumably replay_get_instructions() is the
 * REPLAY_MODE_PLAY branch; confirm against the complete source.
 */
1449 static int64_t tcg_get_icount_limit(void)
1453 if (replay_mode
!= REPLAY_MODE_PLAY
) {
1454 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
1456 /* Maintain prior (possibly buggy) behaviour where if no deadline
1457 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1458 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1461 if ((deadline
< 0) || (deadline
> INT32_MAX
)) {
1462 deadline
= INT32_MAX
;
1465 return qemu_icount_round(deadline
);
1467 return replay_get_instructions();
/*
 * Run @cpu through cpu_exec() with instruction-count bookkeeping.
 *
 * Before execution: the CPU's pending budget (icount_decr.u16.low plus
 * icount_extra) is subtracted from timers_state.qemu_icount and cleared, a
 * new budget is obtained from tcg_get_icount_limit() and added to
 * qemu_icount, and at most 0xffff of it is loaded into icount_decr.u16.low.
 * After execution: whatever is left of the budget is folded back into
 * qemu_icount, icount_decr.u32 and icount_extra are zeroed and
 * replay_account_executed_instructions() is called.
 * With CONFIG_PROFILER the time spent in cpu_exec() is added to tcg_time.
 *
 * NOTE(review): many lines are missing from this listing (local
 * declarations, the conditions guarding the icount paths, the return
 * statement and possibly an adjustment of count before it is stored in
 * icount_extra); confirm against the complete source.
 */
1471 static int tcg_cpu_exec(CPUState
*cpu
)
1474 #ifdef CONFIG_PROFILER
1478 #ifdef CONFIG_PROFILER
1479 ti
= profile_getclock();
1484 timers_state
.qemu_icount
-= (cpu
->icount_decr
.u16
.low
1485 + cpu
->icount_extra
);
1486 cpu
->icount_decr
.u16
.low
= 0;
1487 cpu
->icount_extra
= 0;
1488 count
= tcg_get_icount_limit();
1489 timers_state
.qemu_icount
+= count
;
1490 decr
= (count
> 0xffff) ? 0xffff : count
;
1492 cpu
->icount_decr
.u16
.low
= decr
;
1493 cpu
->icount_extra
= count
;
1495 ret
= cpu_exec(cpu
);
1496 #ifdef CONFIG_PROFILER
1497 tcg_time
+= profile_getclock() - ti
;
1500 /* Fold pending instructions back into the
1501 instruction counter, and clear the interrupt flag. */
1502 timers_state
.qemu_icount
-= (cpu
->icount_decr
.u16
.low
1503 + cpu
->icount_extra
);
1504 cpu
->icount_decr
.u32
= 0;
1505 cpu
->icount_extra
= 0;
1506 replay_account_executed_instructions();
/*
 * Execute the vCPUs one after another under TCG.
 *
 * The file-scope next_cpu cursor keeps the position between calls and is
 * reset to first_cpu when it is NULL.  The loop advances with CPU_NEXT()
 * and ends at the end of the list or as soon as exit_request is set.  For
 * each CPU, QEMU_CLOCK_VIRTUAL is enabled unless SSTEP_NOTIMER is set in
 * singlestep_enabled; a runnable CPU is executed with tcg_cpu_exec() and
 * EXCP_DEBUG is forwarded to cpu_handle_guest_debug().  exit_request is
 * reset to 0 at the end.
 *
 * NOTE(review): lines are missing from this listing (declaration of r and
 * the bodies that leave the loop).  The trailing comment refers to an
 * smp_wmb in qemu_cpu_kick, but the qemu_cpu_kick in this file shows none;
 * the matching write looks like the atomic_mb_set() of exit_request in
 * qemu_cpu_kick_no_halt.  Confirm and update that comment.
 */
1511 static void tcg_exec_all(void)
1515 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1516 qemu_account_warp_timer();
1518 if (next_cpu
== NULL
) {
1519 next_cpu
= first_cpu
;
1521 for (; next_cpu
!= NULL
&& !exit_request
; next_cpu
= CPU_NEXT(next_cpu
)) {
1522 CPUState
*cpu
= next_cpu
;
1524 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
,
1525 (cpu
->singlestep_enabled
& SSTEP_NOTIMER
) == 0);
1527 if (cpu_can_run(cpu
)) {
1528 r
= tcg_cpu_exec(cpu
);
1529 if (r
== EXCP_DEBUG
) {
1530 cpu_handle_guest_debug(cpu
);
1533 } else if (cpu
->stop
|| cpu
->stopped
) {
1538 /* Pairs with smp_wmb in qemu_cpu_kick. */
1539 atomic_mb_set(&exit_request
, 0);
1542 void list_cpus(FILE *f
, fprintf_function cpu_fprintf
, const char *optarg
)
1544 /* XXX: implement xxx_cpu_list for targets that still miss it */
1545 #if defined(cpu_list)
1546 cpu_list(f
, cpu_fprintf
);
/*
 * QMP handler building the list of CPUs.
 *
 * Each CPU gets one zero-initialised CpuInfoList node.  Before the node is
 * filled, cpu_synchronize_state() is called on the CPU.  The common fields
 * are the CPU index, whether it is first_cpu ("current"), the halted flag,
 * the canonical QOM path and the thread id.  Per target, the arch tag and
 * program counter fields are filled in: x86 (eip plus CS base), PPC (nip),
 * SPARC (pc and npc), MIPS (active_tc.PC) and TriCore (PC); the remaining
 * case is tagged CPU_INFO_ARCH_OTHER.  Nodes are appended through cur_item
 * and head names the first one.
 *
 * NOTE(review): the CPU loop, the declaration of info, preprocessor
 * else/endif lines and the return statement are missing from this listing;
 * presumably head is returned; confirm against the complete source.
 */
1550 CpuInfoList
*qmp_query_cpus(Error
**errp
)
1552 CpuInfoList
*head
= NULL
, *cur_item
= NULL
;
1557 #if defined(TARGET_I386)
1558 X86CPU
*x86_cpu
= X86_CPU(cpu
);
1559 CPUX86State
*env
= &x86_cpu
->env
;
1560 #elif defined(TARGET_PPC)
1561 PowerPCCPU
*ppc_cpu
= POWERPC_CPU(cpu
);
1562 CPUPPCState
*env
= &ppc_cpu
->env
;
1563 #elif defined(TARGET_SPARC)
1564 SPARCCPU
*sparc_cpu
= SPARC_CPU(cpu
);
1565 CPUSPARCState
*env
= &sparc_cpu
->env
;
1566 #elif defined(TARGET_MIPS)
1567 MIPSCPU
*mips_cpu
= MIPS_CPU(cpu
);
1568 CPUMIPSState
*env
= &mips_cpu
->env
;
1569 #elif defined(TARGET_TRICORE)
1570 TriCoreCPU
*tricore_cpu
= TRICORE_CPU(cpu
);
1571 CPUTriCoreState
*env
= &tricore_cpu
->env
;
1574 cpu_synchronize_state(cpu
);
1576 info
= g_malloc0(sizeof(*info
));
1577 info
->value
= g_malloc0(sizeof(*info
->value
));
1578 info
->value
->CPU
= cpu
->cpu_index
;
1579 info
->value
->current
= (cpu
== first_cpu
);
1580 info
->value
->halted
= cpu
->halted
;
1581 info
->value
->qom_path
= object_get_canonical_path(OBJECT(cpu
));
1582 info
->value
->thread_id
= cpu
->thread_id
;
1583 #if defined(TARGET_I386)
1584 info
->value
->arch
= CPU_INFO_ARCH_X86
;
1585 info
->value
->u
.x86
.pc
= env
->eip
+ env
->segs
[R_CS
].base
;
1586 #elif defined(TARGET_PPC)
1587 info
->value
->arch
= CPU_INFO_ARCH_PPC
;
1588 info
->value
->u
.ppc
.nip
= env
->nip
;
1589 #elif defined(TARGET_SPARC)
1590 info
->value
->arch
= CPU_INFO_ARCH_SPARC
;
1591 info
->value
->u
.q_sparc
.pc
= env
->pc
;
1592 info
->value
->u
.q_sparc
.npc
= env
->npc
;
1593 #elif defined(TARGET_MIPS)
1594 info
->value
->arch
= CPU_INFO_ARCH_MIPS
;
1595 info
->value
->u
.q_mips
.PC
= env
->active_tc
.PC
;
1596 #elif defined(TARGET_TRICORE)
1597 info
->value
->arch
= CPU_INFO_ARCH_TRICORE
;
1598 info
->value
->u
.tricore
.PC
= env
->PC
;
1600 info
->value
->arch
= CPU_INFO_ARCH_OTHER
;
1603 /* XXX: waiting for the qapi to support GSList */
1605 head
= cur_item
= info
;
1607 cur_item
->next
= info
;
/*
 * QMP handler: save @size bytes of memory starting at @addr, as seen by a
 * CPU, into the file @filename.
 *
 * The CPU is looked up with qemu_get_cpu(@cpu_index); an invalid
 * "cpu-index" is reported through @errp.  The file is opened in "wb" mode
 * and an open failure is reported with error_setg_file_open().  Memory is
 * read with cpu_memory_rw_debug() into buf, l bytes at a time, and written
 * with fwrite().  A failed read reports the original address and size
 * (kept in orig_addr and orig_size); a short write reports QERR_IO_ERROR.
 *
 * NOTE(review): the local declarations, the handling of @has_cpu, the
 * chunking loop and the cleanup path are missing from this listing;
 * confirm against the complete source.
 */
1615 void qmp_memsave(int64_t addr
, int64_t size
, const char *filename
,
1616 bool has_cpu
, int64_t cpu_index
, Error
**errp
)
1622 int64_t orig_addr
= addr
, orig_size
= size
;
1628 cpu
= qemu_get_cpu(cpu_index
);
1630 error_setg(errp
, QERR_INVALID_PARAMETER_VALUE
, "cpu-index",
1635 f
= fopen(filename
, "wb");
1637 error_setg_file_open(errp
, errno
, filename
);
1645 if (cpu_memory_rw_debug(cpu
, addr
, buf
, l
, 0) != 0) {
1646 error_setg(errp
, "Invalid addr 0x%016" PRIx64
"/size %" PRId64
1647 " specified", orig_addr
, orig_size
);
1650 if (fwrite(buf
, 1, l
, f
) != l
) {
1651 error_setg(errp
, QERR_IO_ERROR
);
/*
 * QMP handler: save @size bytes of physical memory starting at @addr into
 * the file @filename.
 *
 * The file is opened in "wb" mode and an open failure is reported with
 * error_setg_file_open().  Memory is read with cpu_physical_memory_read()
 * into buf, l bytes at a time, and written with fwrite(); a short write is
 * reported as QERR_IO_ERROR.
 *
 * NOTE(review): the end of the parameter list, the local declarations,
 * the chunking loop and the cleanup path are missing from this listing;
 * confirm against the complete source.
 */
1662 void qmp_pmemsave(int64_t addr
, int64_t size
, const char *filename
,
1669 f
= fopen(filename
, "wb");
1671 error_setg_file_open(errp
, errno
, filename
);
1679 cpu_physical_memory_read(addr
, buf
, l
);
1680 if (fwrite(buf
, 1, l
, f
) != l
) {
1681 error_setg(errp
, QERR_IO_ERROR
);
/*
 * QMP handler injecting an NMI.
 *
 * On TARGET_I386 a CPU without apic_state receives CPU_INTERRUPT_NMI
 * through cpu_interrupt(), and a CPU with one receives it through
 * apic_deliver_nmi().  The remaining path calls nmi_monitor_handle() with
 * the monitor's CPU index and @errp.
 *
 * NOTE(review): the CPU loop, the else line and the preprocessor
 * else/endif lines are missing from this listing; presumably
 * nmi_monitor_handle() is the non-x86 path; confirm against the complete
 * source.
 */
1692 void qmp_inject_nmi(Error
**errp
)
1694 #if defined(TARGET_I386)
1698 X86CPU
*cpu
= X86_CPU(cs
);
1700 if (!cpu
->apic_state
) {
1701 cpu_interrupt(cs
, CPU_INTERRUPT_NMI
);
1703 apic_deliver_nmi(cpu
->apic_state
);
1707 nmi_monitor_handle(monitor_get_cpu_index(), errp
);
/*
 * Print clock drift information to @f through @cpu_fprintf.
 *
 * The difference between cpu_get_clock() and cpu_get_icount() is printed
 * in milliseconds as "Host - Guest clock".  When icount_align_option is
 * set, the maximum guest delay (-max_delay) and advance (max_advance) are
 * printed in milliseconds as well; the remaining two calls print "NA" for
 * both values.
 *
 * NOTE(review): lines are missing from this listing (any early-return
 * condition, the else line and the closing braces); confirm against the
 * complete source.
 */
1711 void dump_drift_info(FILE *f
, fprintf_function cpu_fprintf
)
1717 cpu_fprintf(f
, "Host - Guest clock %"PRIi64
" ms\n",
1718 (cpu_get_clock() - cpu_get_icount())/SCALE_MS
);
1719 if (icount_align_option
) {
1720 cpu_fprintf(f
, "Max guest delay %"PRIi64
" ms\n", -max_delay
/SCALE_MS
);
1721 cpu_fprintf(f
, "Max guest advance %"PRIi64
" ms\n", max_advance
/SCALE_MS
);
1723 cpu_fprintf(f
, "Max guest delay NA\n");
1724 cpu_fprintf(f
, "Max guest advance NA\n");