4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "qemu/cutils.h"
29 #include "migration/vmstate.h"
30 #include "monitor/monitor.h"
31 #include "qapi/error.h"
32 #include "qapi/qapi-commands-misc.h"
33 #include "qapi/qapi-events-run-state.h"
34 #include "qapi/qmp/qerror.h"
35 #include "qemu/error-report.h"
36 #include "qemu/qemu-print.h"
37 #include "sysemu/tcg.h"
38 #include "sysemu/block-backend.h"
39 #include "exec/gdbstub.h"
40 #include "sysemu/dma.h"
41 #include "sysemu/hw_accel.h"
42 #include "sysemu/kvm.h"
43 #include "sysemu/hax.h"
44 #include "sysemu/hvf.h"
45 #include "sysemu/whpx.h"
46 #include "exec/exec-all.h"
48 #include "qemu/thread.h"
49 #include "qemu/plugin.h"
50 #include "sysemu/cpus.h"
51 #include "sysemu/qtest.h"
52 #include "qemu/main-loop.h"
53 #include "qemu/option.h"
54 #include "qemu/bitmap.h"
55 #include "qemu/seqlock.h"
56 #include "qemu/guest-random.h"
59 #include "sysemu/replay.h"
60 #include "sysemu/runstate.h"
61 #include "hw/boards.h"
66 #include <sys/prctl.h>
69 #define PR_MCE_KILL 33
72 #ifndef PR_MCE_KILL_SET
73 #define PR_MCE_KILL_SET 1
76 #ifndef PR_MCE_KILL_EARLY
77 #define PR_MCE_KILL_EARLY 1
80 #endif /* CONFIG_LINUX */
82 static QemuMutex qemu_global_mutex
;
87 /* vcpu throttling controls */
88 static QEMUTimer
*throttle_timer
;
89 static unsigned int throttle_percentage
;
91 #define CPU_THROTTLE_PCT_MIN 1
92 #define CPU_THROTTLE_PCT_MAX 99
93 #define CPU_THROTTLE_TIMESLICE_NS 10000000
95 bool cpu_is_stopped(CPUState
*cpu
)
97 return cpu
->stopped
|| !runstate_is_running();
100 static bool cpu_thread_is_idle(CPUState
*cpu
)
102 if (cpu
->stop
|| cpu
->queued_work_first
) {
105 if (cpu_is_stopped(cpu
)) {
108 if (!cpu
->halted
|| cpu_has_work(cpu
) ||
109 kvm_halt_in_kernel()) {
115 static bool all_cpu_threads_idle(void)
120 if (!cpu_thread_is_idle(cpu
)) {
127 /***********************************************************/
128 /* guest cycle counter */
130 /* Protected by TimersState seqlock */
132 static bool icount_sleep
= true;
133 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
134 #define MAX_ICOUNT_SHIFT 10
136 typedef struct TimersState
{
137 /* Protected by BQL. */
138 int64_t cpu_ticks_prev
;
139 int64_t cpu_ticks_offset
;
141 /* Protect fields that can be respectively read outside the
142 * BQL, and written from multiple threads.
144 QemuSeqLock vm_clock_seqlock
;
145 QemuSpin vm_clock_lock
;
147 int16_t cpu_ticks_enabled
;
149 /* Conversion factor from emulated instructions to virtual clock ticks. */
150 int16_t icount_time_shift
;
152 /* Compensate for varying guest execution speed. */
153 int64_t qemu_icount_bias
;
155 int64_t vm_clock_warp_start
;
156 int64_t cpu_clock_offset
;
158 /* Only written by TCG thread */
161 /* for adjusting icount */
162 QEMUTimer
*icount_rt_timer
;
163 QEMUTimer
*icount_vm_timer
;
164 QEMUTimer
*icount_warp_timer
;
167 static TimersState timers_state
;
171 /* The current number of executed instructions is based on what we
172 * originally budgeted minus the current state of the decrementing
173 * icount counters in extra/u16.low.
175 static int64_t cpu_get_icount_executed(CPUState
*cpu
)
177 return (cpu
->icount_budget
-
178 (cpu_neg(cpu
)->icount_decr
.u16
.low
+ cpu
->icount_extra
));
182 * Update the global shared timer_state.qemu_icount to take into
183 * account executed instructions. This is done by the TCG vCPU
184 * thread so the main-loop can see time has moved forward.
186 static void cpu_update_icount_locked(CPUState
*cpu
)
188 int64_t executed
= cpu_get_icount_executed(cpu
);
189 cpu
->icount_budget
-= executed
;
191 atomic_set_i64(&timers_state
.qemu_icount
,
192 timers_state
.qemu_icount
+ executed
);
196 * Update the global shared timer_state.qemu_icount to take into
197 * account executed instructions. This is done by the TCG vCPU
198 * thread so the main-loop can see time has moved forward.
200 void cpu_update_icount(CPUState
*cpu
)
202 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
203 &timers_state
.vm_clock_lock
);
204 cpu_update_icount_locked(cpu
);
205 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
206 &timers_state
.vm_clock_lock
);
209 static int64_t cpu_get_icount_raw_locked(void)
211 CPUState
*cpu
= current_cpu
;
213 if (cpu
&& cpu
->running
) {
214 if (!cpu
->can_do_io
) {
215 error_report("Bad icount read");
218 /* Take into account what has run */
219 cpu_update_icount_locked(cpu
);
221 /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
222 return atomic_read_i64(&timers_state
.qemu_icount
);
225 static int64_t cpu_get_icount_locked(void)
227 int64_t icount
= cpu_get_icount_raw_locked();
228 return atomic_read_i64(&timers_state
.qemu_icount_bias
) +
229 cpu_icount_to_ns(icount
);
232 int64_t cpu_get_icount_raw(void)
238 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
239 icount
= cpu_get_icount_raw_locked();
240 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
245 /* Return the virtual CPU time, based on the instruction counter. */
246 int64_t cpu_get_icount(void)
252 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
253 icount
= cpu_get_icount_locked();
254 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
259 int64_t cpu_icount_to_ns(int64_t icount
)
261 return icount
<< atomic_read(&timers_state
.icount_time_shift
);
264 static int64_t cpu_get_ticks_locked(void)
266 int64_t ticks
= timers_state
.cpu_ticks_offset
;
267 if (timers_state
.cpu_ticks_enabled
) {
268 ticks
+= cpu_get_host_ticks();
271 if (timers_state
.cpu_ticks_prev
> ticks
) {
272 /* Non increasing ticks may happen if the host uses software suspend. */
273 timers_state
.cpu_ticks_offset
+= timers_state
.cpu_ticks_prev
- ticks
;
274 ticks
= timers_state
.cpu_ticks_prev
;
277 timers_state
.cpu_ticks_prev
= ticks
;
281 /* return the time elapsed in VM between vm_start and vm_stop. Unless
282 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
285 int64_t cpu_get_ticks(void)
290 return cpu_get_icount();
293 qemu_spin_lock(&timers_state
.vm_clock_lock
);
294 ticks
= cpu_get_ticks_locked();
295 qemu_spin_unlock(&timers_state
.vm_clock_lock
);
299 static int64_t cpu_get_clock_locked(void)
303 time
= timers_state
.cpu_clock_offset
;
304 if (timers_state
.cpu_ticks_enabled
) {
311 /* Return the monotonic time elapsed in VM, i.e.,
312 * the time between vm_start and vm_stop
314 int64_t cpu_get_clock(void)
320 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
321 ti
= cpu_get_clock_locked();
322 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
327 /* enable cpu_get_ticks()
328 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
330 void cpu_enable_ticks(void)
332 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
333 &timers_state
.vm_clock_lock
);
334 if (!timers_state
.cpu_ticks_enabled
) {
335 timers_state
.cpu_ticks_offset
-= cpu_get_host_ticks();
336 timers_state
.cpu_clock_offset
-= get_clock();
337 timers_state
.cpu_ticks_enabled
= 1;
339 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
340 &timers_state
.vm_clock_lock
);
343 /* disable cpu_get_ticks() : the clock is stopped. You must not call
344 * cpu_get_ticks() after that.
345 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
347 void cpu_disable_ticks(void)
349 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
350 &timers_state
.vm_clock_lock
);
351 if (timers_state
.cpu_ticks_enabled
) {
352 timers_state
.cpu_ticks_offset
+= cpu_get_host_ticks();
353 timers_state
.cpu_clock_offset
= cpu_get_clock_locked();
354 timers_state
.cpu_ticks_enabled
= 0;
356 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
357 &timers_state
.vm_clock_lock
);
360 /* Correlation between real and virtual time is always going to be
361 fairly approximate, so ignore small variation.
362 When the guest is idle real and virtual time will be aligned in
364 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
366 static void icount_adjust(void)
372 /* Protected by TimersState mutex. */
373 static int64_t last_delta
;
375 /* If the VM is not running, then do nothing. */
376 if (!runstate_is_running()) {
380 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
381 &timers_state
.vm_clock_lock
);
382 cur_time
= REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT
,
383 cpu_get_clock_locked());
384 cur_icount
= cpu_get_icount_locked();
386 delta
= cur_icount
- cur_time
;
387 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
389 && last_delta
+ ICOUNT_WOBBLE
< delta
* 2
390 && timers_state
.icount_time_shift
> 0) {
391 /* The guest is getting too far ahead. Slow time down. */
392 atomic_set(&timers_state
.icount_time_shift
,
393 timers_state
.icount_time_shift
- 1);
396 && last_delta
- ICOUNT_WOBBLE
> delta
* 2
397 && timers_state
.icount_time_shift
< MAX_ICOUNT_SHIFT
) {
398 /* The guest is getting too far behind. Speed time up. */
399 atomic_set(&timers_state
.icount_time_shift
,
400 timers_state
.icount_time_shift
+ 1);
403 atomic_set_i64(&timers_state
.qemu_icount_bias
,
404 cur_icount
- (timers_state
.qemu_icount
405 << timers_state
.icount_time_shift
));
406 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
407 &timers_state
.vm_clock_lock
);
410 static void icount_adjust_rt(void *opaque
)
412 timer_mod(timers_state
.icount_rt_timer
,
413 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
417 static void icount_adjust_vm(void *opaque
)
419 timer_mod(timers_state
.icount_vm_timer
,
420 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
421 NANOSECONDS_PER_SECOND
/ 10);
425 static int64_t qemu_icount_round(int64_t count
)
427 int shift
= atomic_read(&timers_state
.icount_time_shift
);
428 return (count
+ (1 << shift
) - 1) >> shift
;
431 static void icount_warp_rt(void)
436 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
437 * changes from -1 to another value, so the race here is okay.
440 seq
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
441 warp_start
= timers_state
.vm_clock_warp_start
;
442 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, seq
));
444 if (warp_start
== -1) {
448 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
449 &timers_state
.vm_clock_lock
);
450 if (runstate_is_running()) {
451 int64_t clock
= REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT
,
452 cpu_get_clock_locked());
455 warp_delta
= clock
- timers_state
.vm_clock_warp_start
;
456 if (use_icount
== 2) {
458 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
459 * far ahead of real time.
461 int64_t cur_icount
= cpu_get_icount_locked();
462 int64_t delta
= clock
- cur_icount
;
463 warp_delta
= MIN(warp_delta
, delta
);
465 atomic_set_i64(&timers_state
.qemu_icount_bias
,
466 timers_state
.qemu_icount_bias
+ warp_delta
);
468 timers_state
.vm_clock_warp_start
= -1;
469 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
470 &timers_state
.vm_clock_lock
);
472 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL
)) {
473 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
477 static void icount_timer_cb(void *opaque
)
479 /* No need for a checkpoint because the timer already synchronizes
480 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
485 void qtest_clock_warp(int64_t dest
)
487 int64_t clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
488 AioContext
*aio_context
;
489 assert(qtest_enabled());
490 aio_context
= qemu_get_aio_context();
491 while (clock
< dest
) {
492 int64_t deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
,
493 QEMU_TIMER_ATTR_ALL
);
494 int64_t warp
= qemu_soonest_timeout(dest
- clock
, deadline
);
496 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
497 &timers_state
.vm_clock_lock
);
498 atomic_set_i64(&timers_state
.qemu_icount_bias
,
499 timers_state
.qemu_icount_bias
+ warp
);
500 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
501 &timers_state
.vm_clock_lock
);
503 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL
);
504 timerlist_run_timers(aio_context
->tlg
.tl
[QEMU_CLOCK_VIRTUAL
]);
505 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
507 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
510 void qemu_start_warp_timer(void)
519 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
520 * do not fire, so computing the deadline does not make sense.
522 if (!runstate_is_running()) {
526 if (replay_mode
!= REPLAY_MODE_PLAY
) {
527 if (!all_cpu_threads_idle()) {
531 if (qtest_enabled()) {
532 /* When testing, qtest commands advance icount. */
536 replay_checkpoint(CHECKPOINT_CLOCK_WARP_START
);
538 /* warp clock deterministically in record/replay mode */
539 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START
)) {
540 /* vCPU is sleeping and warp can't be started.
541 It is probably a race condition: notification sent
542 to vCPU was processed in advance and vCPU went to sleep.
               Therefore we have to wake it up for doing something. */
544 if (replay_has_checkpoint()) {
545 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
551 /* We want to use the earliest deadline from ALL vm_clocks */
552 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
);
553 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
,
554 ~QEMU_TIMER_ATTR_EXTERNAL
);
556 static bool notified
;
557 if (!icount_sleep
&& !notified
) {
558 warn_report("icount sleep disabled and no active timers");
566 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
567 * sleep. Otherwise, the CPU might be waiting for a future timer
568 * interrupt to wake it up, but the interrupt never comes because
569 * the vCPU isn't running any insns and thus doesn't advance the
570 * QEMU_CLOCK_VIRTUAL.
574 * We never let VCPUs sleep in no sleep icount mode.
575 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
576 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
577 * It is useful when we want a deterministic execution time,
578 * isolated from host latencies.
580 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
581 &timers_state
.vm_clock_lock
);
582 atomic_set_i64(&timers_state
.qemu_icount_bias
,
583 timers_state
.qemu_icount_bias
+ deadline
);
584 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
585 &timers_state
.vm_clock_lock
);
586 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
589 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
590 * "real" time, (related to the time left until the next event) has
591 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
592 * This avoids that the warps are visible externally; for example,
593 * you will not be sending network packets continuously instead of
596 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
597 &timers_state
.vm_clock_lock
);
598 if (timers_state
.vm_clock_warp_start
== -1
599 || timers_state
.vm_clock_warp_start
> clock
) {
600 timers_state
.vm_clock_warp_start
= clock
;
602 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
603 &timers_state
.vm_clock_lock
);
604 timer_mod_anticipate(timers_state
.icount_warp_timer
,
607 } else if (deadline
== 0) {
608 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
612 static void qemu_account_warp_timer(void)
614 if (!use_icount
|| !icount_sleep
) {
618 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
619 * do not fire, so computing the deadline does not make sense.
621 if (!runstate_is_running()) {
625 /* warp clock deterministically in record/replay mode */
626 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT
)) {
630 timer_del(timers_state
.icount_warp_timer
);
634 static bool icount_state_needed(void *opaque
)
639 static bool warp_timer_state_needed(void *opaque
)
641 TimersState
*s
= opaque
;
642 return s
->icount_warp_timer
!= NULL
;
645 static bool adjust_timers_state_needed(void *opaque
)
647 TimersState
*s
= opaque
;
648 return s
->icount_rt_timer
!= NULL
;
651 static bool shift_state_needed(void *opaque
)
653 return use_icount
== 2;
 * Subsection for warp timer migration is optional, because it may not be created
659 static const VMStateDescription icount_vmstate_warp_timer
= {
660 .name
= "timer/icount/warp_timer",
662 .minimum_version_id
= 1,
663 .needed
= warp_timer_state_needed
,
664 .fields
= (VMStateField
[]) {
665 VMSTATE_INT64(vm_clock_warp_start
, TimersState
),
666 VMSTATE_TIMER_PTR(icount_warp_timer
, TimersState
),
667 VMSTATE_END_OF_LIST()
671 static const VMStateDescription icount_vmstate_adjust_timers
= {
672 .name
= "timer/icount/timers",
674 .minimum_version_id
= 1,
675 .needed
= adjust_timers_state_needed
,
676 .fields
= (VMStateField
[]) {
677 VMSTATE_TIMER_PTR(icount_rt_timer
, TimersState
),
678 VMSTATE_TIMER_PTR(icount_vm_timer
, TimersState
),
679 VMSTATE_END_OF_LIST()
683 static const VMStateDescription icount_vmstate_shift
= {
684 .name
= "timer/icount/shift",
686 .minimum_version_id
= 1,
687 .needed
= shift_state_needed
,
688 .fields
= (VMStateField
[]) {
689 VMSTATE_INT16(icount_time_shift
, TimersState
),
690 VMSTATE_END_OF_LIST()
695 * This is a subsection for icount migration.
697 static const VMStateDescription icount_vmstate_timers
= {
698 .name
= "timer/icount",
700 .minimum_version_id
= 1,
701 .needed
= icount_state_needed
,
702 .fields
= (VMStateField
[]) {
703 VMSTATE_INT64(qemu_icount_bias
, TimersState
),
704 VMSTATE_INT64(qemu_icount
, TimersState
),
705 VMSTATE_END_OF_LIST()
707 .subsections
= (const VMStateDescription
*[]) {
708 &icount_vmstate_warp_timer
,
709 &icount_vmstate_adjust_timers
,
710 &icount_vmstate_shift
,
715 static const VMStateDescription vmstate_timers
= {
718 .minimum_version_id
= 1,
719 .fields
= (VMStateField
[]) {
720 VMSTATE_INT64(cpu_ticks_offset
, TimersState
),
722 VMSTATE_INT64_V(cpu_clock_offset
, TimersState
, 2),
723 VMSTATE_END_OF_LIST()
725 .subsections
= (const VMStateDescription
*[]) {
726 &icount_vmstate_timers
,
731 static void cpu_throttle_thread(CPUState
*cpu
, run_on_cpu_data opaque
)
734 double throttle_ratio
;
735 int64_t sleeptime_ns
, endtime_ns
;
737 if (!cpu_throttle_get_percentage()) {
741 pct
= (double)cpu_throttle_get_percentage()/100;
742 throttle_ratio
= pct
/ (1 - pct
);
743 /* Add 1ns to fix double's rounding error (like 0.9999999...) */
744 sleeptime_ns
= (int64_t)(throttle_ratio
* CPU_THROTTLE_TIMESLICE_NS
+ 1);
745 endtime_ns
= qemu_clock_get_ns(QEMU_CLOCK_REALTIME
) + sleeptime_ns
;
746 while (sleeptime_ns
> 0 && !cpu
->stop
) {
747 if (sleeptime_ns
> SCALE_MS
) {
748 qemu_cond_timedwait(cpu
->halt_cond
, &qemu_global_mutex
,
749 sleeptime_ns
/ SCALE_MS
);
751 qemu_mutex_unlock_iothread();
752 g_usleep(sleeptime_ns
/ SCALE_US
);
753 qemu_mutex_lock_iothread();
755 sleeptime_ns
= endtime_ns
- qemu_clock_get_ns(QEMU_CLOCK_REALTIME
);
757 atomic_set(&cpu
->throttle_thread_scheduled
, 0);
760 static void cpu_throttle_timer_tick(void *opaque
)
765 /* Stop the timer if needed */
766 if (!cpu_throttle_get_percentage()) {
770 if (!atomic_xchg(&cpu
->throttle_thread_scheduled
, 1)) {
771 async_run_on_cpu(cpu
, cpu_throttle_thread
,
776 pct
= (double)cpu_throttle_get_percentage()/100;
777 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
778 CPU_THROTTLE_TIMESLICE_NS
/ (1-pct
));
781 void cpu_throttle_set(int new_throttle_pct
)
783 /* Ensure throttle percentage is within valid range */
784 new_throttle_pct
= MIN(new_throttle_pct
, CPU_THROTTLE_PCT_MAX
);
785 new_throttle_pct
= MAX(new_throttle_pct
, CPU_THROTTLE_PCT_MIN
);
787 atomic_set(&throttle_percentage
, new_throttle_pct
);
789 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
790 CPU_THROTTLE_TIMESLICE_NS
);
793 void cpu_throttle_stop(void)
795 atomic_set(&throttle_percentage
, 0);
/* Return true while the throttle percentage is non-zero. */
bool cpu_throttle_active(void)
{
    int pct = cpu_throttle_get_percentage();

    return pct != 0;
}
803 int cpu_throttle_get_percentage(void)
805 return atomic_read(&throttle_percentage
);
808 void cpu_ticks_init(void)
810 seqlock_init(&timers_state
.vm_clock_seqlock
);
811 qemu_spin_init(&timers_state
.vm_clock_lock
);
812 vmstate_register(NULL
, 0, &vmstate_timers
, &timers_state
);
813 throttle_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
814 cpu_throttle_timer_tick
, NULL
);
817 void configure_icount(QemuOpts
*opts
, Error
**errp
)
819 const char *option
= qemu_opt_get(opts
, "shift");
820 bool sleep
= qemu_opt_get_bool(opts
, "sleep", true);
821 bool align
= qemu_opt_get_bool(opts
, "align", false);
822 long time_shift
= -1;
824 if (!option
&& qemu_opt_get(opts
, "align")) {
825 error_setg(errp
, "Please specify shift option when using align");
829 if (align
&& !sleep
) {
830 error_setg(errp
, "align=on and sleep=off are incompatible");
834 if (strcmp(option
, "auto") != 0) {
835 if (qemu_strtol(option
, NULL
, 0, &time_shift
) < 0
836 || time_shift
< 0 || time_shift
> MAX_ICOUNT_SHIFT
) {
837 error_setg(errp
, "icount: Invalid shift value");
840 } else if (icount_align_option
) {
841 error_setg(errp
, "shift=auto and align=on are incompatible");
843 } else if (!icount_sleep
) {
844 error_setg(errp
, "shift=auto and sleep=off are incompatible");
848 icount_sleep
= sleep
;
850 timers_state
.icount_warp_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
851 icount_timer_cb
, NULL
);
854 icount_align_option
= align
;
856 if (time_shift
>= 0) {
857 timers_state
.icount_time_shift
= time_shift
;
864 /* 125MIPS seems a reasonable initial guess at the guest speed.
865 It will be corrected fairly quickly anyway. */
866 timers_state
.icount_time_shift
= 3;
868 /* Have both realtime and virtual time triggers for speed adjustment.
869 The realtime trigger catches emulated time passing too slowly,
870 the virtual time trigger catches emulated time passing too fast.
871 Realtime triggers occur even when idle, so use them less frequently
873 timers_state
.vm_clock_warp_start
= -1;
874 timers_state
.icount_rt_timer
= timer_new_ms(QEMU_CLOCK_VIRTUAL_RT
,
875 icount_adjust_rt
, NULL
);
876 timer_mod(timers_state
.icount_rt_timer
,
877 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
878 timers_state
.icount_vm_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL
,
879 icount_adjust_vm
, NULL
);
880 timer_mod(timers_state
.icount_vm_timer
,
881 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
882 NANOSECONDS_PER_SECOND
/ 10);
885 /***********************************************************/
886 /* TCG vCPU kick timer
888 * The kick timer is responsible for moving single threaded vCPU
889 * emulation on to the next vCPU. If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
893 * The timer is removed if all vCPUs are idle and restarted again once
894 * idleness is complete.
897 static QEMUTimer
*tcg_kick_vcpu_timer
;
898 static CPUState
*tcg_current_rr_cpu
;
900 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
902 static inline int64_t qemu_tcg_next_kick(void)
904 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) + TCG_KICK_PERIOD
;
907 /* Kick the currently round-robin scheduled vCPU to next */
908 static void qemu_cpu_kick_rr_next_cpu(void)
912 cpu
= atomic_mb_read(&tcg_current_rr_cpu
);
916 } while (cpu
!= atomic_mb_read(&tcg_current_rr_cpu
));
919 /* Kick all RR vCPUs */
920 static void qemu_cpu_kick_rr_cpus(void)
929 static void do_nothing(CPUState
*cpu
, run_on_cpu_data unused
)
933 void qemu_timer_notify_cb(void *opaque
, QEMUClockType type
)
935 if (!use_icount
|| type
!= QEMU_CLOCK_VIRTUAL
) {
940 if (qemu_in_vcpu_thread()) {
941 /* A CPU is currently running; kick it back out to the
942 * tcg_cpu_exec() loop so it will recalculate its
943 * icount deadline immediately.
945 qemu_cpu_kick(current_cpu
);
946 } else if (first_cpu
) {
947 /* qemu_cpu_kick is not enough to kick a halted CPU out of
948 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
949 * causes cpu_thread_is_idle to return false. This way,
950 * handle_icount_deadline can run.
951 * If we have no CPUs at all for some reason, we don't
952 * need to do anything.
954 async_run_on_cpu(first_cpu
, do_nothing
, RUN_ON_CPU_NULL
);
958 static void kick_tcg_thread(void *opaque
)
960 timer_mod(tcg_kick_vcpu_timer
, qemu_tcg_next_kick());
961 qemu_cpu_kick_rr_next_cpu();
964 static void start_tcg_kick_timer(void)
966 assert(!mttcg_enabled
);
967 if (!tcg_kick_vcpu_timer
&& CPU_NEXT(first_cpu
)) {
968 tcg_kick_vcpu_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL
,
969 kick_tcg_thread
, NULL
);
971 if (tcg_kick_vcpu_timer
&& !timer_pending(tcg_kick_vcpu_timer
)) {
972 timer_mod(tcg_kick_vcpu_timer
, qemu_tcg_next_kick());
976 static void stop_tcg_kick_timer(void)
978 assert(!mttcg_enabled
);
979 if (tcg_kick_vcpu_timer
&& timer_pending(tcg_kick_vcpu_timer
)) {
980 timer_del(tcg_kick_vcpu_timer
);
984 /***********************************************************/
985 void hw_error(const char *fmt
, ...)
991 fprintf(stderr
, "qemu: hardware error: ");
992 vfprintf(stderr
, fmt
, ap
);
993 fprintf(stderr
, "\n");
995 fprintf(stderr
, "CPU #%d:\n", cpu
->cpu_index
);
996 cpu_dump_state(cpu
, stderr
, CPU_DUMP_FPU
);
1002 void cpu_synchronize_all_states(void)
1007 cpu_synchronize_state(cpu
);
1008 /* TODO: move to cpu_synchronize_state() */
1009 if (hvf_enabled()) {
1010 hvf_cpu_synchronize_state(cpu
);
1015 void cpu_synchronize_all_post_reset(void)
1020 cpu_synchronize_post_reset(cpu
);
1021 /* TODO: move to cpu_synchronize_post_reset() */
1022 if (hvf_enabled()) {
1023 hvf_cpu_synchronize_post_reset(cpu
);
1028 void cpu_synchronize_all_post_init(void)
1033 cpu_synchronize_post_init(cpu
);
1034 /* TODO: move to cpu_synchronize_post_init() */
1035 if (hvf_enabled()) {
1036 hvf_cpu_synchronize_post_init(cpu
);
1041 void cpu_synchronize_all_pre_loadvm(void)
1046 cpu_synchronize_pre_loadvm(cpu
);
1050 static int do_vm_stop(RunState state
, bool send_stop
)
1054 if (runstate_is_running()) {
1055 runstate_set(state
);
1056 cpu_disable_ticks();
1058 vm_state_notify(0, state
);
1060 qapi_event_send_stop();
1065 ret
= bdrv_flush_all();
1070 /* Special vm_stop() variant for terminating the process. Historically clients
1071 * did not expect a QMP STOP event and so we need to retain compatibility.
1073 int vm_shutdown(void)
1075 return do_vm_stop(RUN_STATE_SHUTDOWN
, false);
1078 static bool cpu_can_run(CPUState
*cpu
)
1083 if (cpu_is_stopped(cpu
)) {
1089 static void cpu_handle_guest_debug(CPUState
*cpu
)
1091 gdb_set_stop_cpu(cpu
);
1092 qemu_system_debug_request();
1093 cpu
->stopped
= true;
1097 static void sigbus_reraise(void)
1100 struct sigaction action
;
1102 memset(&action
, 0, sizeof(action
));
1103 action
.sa_handler
= SIG_DFL
;
1104 if (!sigaction(SIGBUS
, &action
, NULL
)) {
1107 sigaddset(&set
, SIGBUS
);
1108 pthread_sigmask(SIG_UNBLOCK
, &set
, NULL
);
1110 perror("Failed to re-raise SIGBUS!\n");
1114 static void sigbus_handler(int n
, siginfo_t
*siginfo
, void *ctx
)
1116 if (siginfo
->si_code
!= BUS_MCEERR_AO
&& siginfo
->si_code
!= BUS_MCEERR_AR
) {
1121 /* Called asynchronously in VCPU thread. */
1122 if (kvm_on_sigbus_vcpu(current_cpu
, siginfo
->si_code
, siginfo
->si_addr
)) {
1126 /* Called synchronously (via signalfd) in main thread. */
1127 if (kvm_on_sigbus(siginfo
->si_code
, siginfo
->si_addr
)) {
1133 static void qemu_init_sigbus(void)
1135 struct sigaction action
;
1137 memset(&action
, 0, sizeof(action
));
1138 action
.sa_flags
= SA_SIGINFO
;
1139 action
.sa_sigaction
= sigbus_handler
;
1140 sigaction(SIGBUS
, &action
, NULL
);
1142 prctl(PR_MCE_KILL
, PR_MCE_KILL_SET
, PR_MCE_KILL_EARLY
, 0, 0);
1144 #else /* !CONFIG_LINUX */
/* Non-Linux build: there is no SIGBUS setup to perform. */
static void qemu_init_sigbus(void)
{
    /* Intentionally empty. */
}
1148 #endif /* !CONFIG_LINUX */
1150 static QemuThread io_thread
;
1153 static QemuCond qemu_cpu_cond
;
1155 static QemuCond qemu_pause_cond
;
1157 void qemu_init_cpu_loop(void)
1160 qemu_cond_init(&qemu_cpu_cond
);
1161 qemu_cond_init(&qemu_pause_cond
);
1162 qemu_mutex_init(&qemu_global_mutex
);
1164 qemu_thread_get_self(&io_thread
);
1167 void run_on_cpu(CPUState
*cpu
, run_on_cpu_func func
, run_on_cpu_data data
)
1169 do_run_on_cpu(cpu
, func
, data
, &qemu_global_mutex
);
1172 static void qemu_kvm_destroy_vcpu(CPUState
*cpu
)
1174 if (kvm_destroy_vcpu(cpu
) < 0) {
1175 error_report("kvm_destroy_vcpu failed");
1180 static void qemu_tcg_destroy_vcpu(CPUState
*cpu
)
1184 static void qemu_cpu_stop(CPUState
*cpu
, bool exit
)
1186 g_assert(qemu_cpu_is_self(cpu
));
1188 cpu
->stopped
= true;
1192 qemu_cond_broadcast(&qemu_pause_cond
);
1195 static void qemu_wait_io_event_common(CPUState
*cpu
)
1197 atomic_mb_set(&cpu
->thread_kicked
, false);
1199 qemu_cpu_stop(cpu
, false);
1201 process_queued_cpu_work(cpu
);
1204 static void qemu_tcg_rr_wait_io_event(void)
1208 while (all_cpu_threads_idle()) {
1209 stop_tcg_kick_timer();
1210 qemu_cond_wait(first_cpu
->halt_cond
, &qemu_global_mutex
);
1213 start_tcg_kick_timer();
1216 qemu_wait_io_event_common(cpu
);
1220 static void qemu_wait_io_event(CPUState
*cpu
)
1224 while (cpu_thread_is_idle(cpu
)) {
1227 qemu_plugin_vcpu_idle_cb(cpu
);
1229 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1232 qemu_plugin_vcpu_resume_cb(cpu
);
1236 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1237 if (!tcg_enabled()) {
1241 qemu_wait_io_event_common(cpu
);
1244 static void *qemu_kvm_cpu_thread_fn(void *arg
)
1246 CPUState
*cpu
= arg
;
1249 rcu_register_thread();
1251 qemu_mutex_lock_iothread();
1252 qemu_thread_get_self(cpu
->thread
);
1253 cpu
->thread_id
= qemu_get_thread_id();
1257 r
= kvm_init_vcpu(cpu
);
1259 error_report("kvm_init_vcpu failed: %s", strerror(-r
));
1263 kvm_init_cpu_signals(cpu
);
1265 /* signal CPU creation */
1266 cpu
->created
= true;
1267 qemu_cond_signal(&qemu_cpu_cond
);
1268 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1271 if (cpu_can_run(cpu
)) {
1272 r
= kvm_cpu_exec(cpu
);
1273 if (r
== EXCP_DEBUG
) {
1274 cpu_handle_guest_debug(cpu
);
1277 qemu_wait_io_event(cpu
);
1278 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1280 qemu_kvm_destroy_vcpu(cpu
);
1281 cpu
->created
= false;
1282 qemu_cond_signal(&qemu_cpu_cond
);
1283 qemu_mutex_unlock_iothread();
1284 rcu_unregister_thread();
1288 static void *qemu_dummy_cpu_thread_fn(void *arg
)
1291 error_report("qtest is not supported under Windows");
1294 CPUState
*cpu
= arg
;
1298 rcu_register_thread();
1300 qemu_mutex_lock_iothread();
1301 qemu_thread_get_self(cpu
->thread
);
1302 cpu
->thread_id
= qemu_get_thread_id();
1306 sigemptyset(&waitset
);
1307 sigaddset(&waitset
, SIG_IPI
);
1309 /* signal CPU creation */
1310 cpu
->created
= true;
1311 qemu_cond_signal(&qemu_cpu_cond
);
1312 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1315 qemu_mutex_unlock_iothread();
1318 r
= sigwait(&waitset
, &sig
);
1319 } while (r
== -1 && (errno
== EAGAIN
|| errno
== EINTR
));
1324 qemu_mutex_lock_iothread();
1325 qemu_wait_io_event(cpu
);
1326 } while (!cpu
->unplug
);
1328 qemu_mutex_unlock_iothread();
1329 rcu_unregister_thread();
/*
 * Compute the instruction budget for the next TCG execution slice.
 *
 * When replay is not in play mode, the budget is derived from the nearest
 * timer deadline: the soonest of the QEMU_CLOCK_VIRTUAL and
 * QEMU_CLOCK_REALTIME deadlines (all timer attributes).  A negative
 * deadline or one larger than INT32_MAX is replaced by INT32_MAX, and the
 * result is passed through qemu_icount_round().
 *
 * In replay play mode the value of replay_get_instructions() is returned
 * instead.
 */
1334 static int64_t tcg_get_icount_limit(void)
1338 if (replay_mode
!= REPLAY_MODE_PLAY
) {
1340 * Include all the timers, because they may need an attention.
1341 * Too long CPU execution may create unnecessary delay in UI.
1343 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
,
1344 QEMU_TIMER_ATTR_ALL
);
1345 /* Check realtime timers, because they help with input processing */
1346 deadline
= qemu_soonest_timeout(deadline
,
1347 qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME
,
1348 QEMU_TIMER_ATTR_ALL
));
1350 /* Maintain prior (possibly buggy) behaviour where if no deadline
1351 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1352 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1355 if ((deadline
< 0) || (deadline
> INT32_MAX
)) {
1356 deadline
= INT32_MAX
;
1359 return qemu_icount_round(deadline
);
1361 return replay_get_instructions();
/*
 * Run expired QEMU_CLOCK_VIRTUAL timers from the vCPU thread.
 *
 * Must be called from a vCPU thread (asserted).  If the nearest
 * QEMU_CLOCK_VIRTUAL deadline is already due (deadline == 0), the clock is
 * notified so other AioContexts wake up, and the timers are run directly
 * here.
 */
1365 static void handle_icount_deadline(void)
1367 assert(qemu_in_vcpu_thread());
1369 int64_t deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
,
1370 QEMU_TIMER_ATTR_ALL
);
1372 if (deadline
== 0) {
1373 /* Wake up other AioContexts. */
1374 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
1375 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL
);
/*
 * Load the instruction-count budget into @cpu before it executes.
 *
 * Asserts that the low 16-bit decrementer and icount_extra are both zero,
 * then fetches a new budget from tcg_get_icount_limit().  At most 0xffff
 * instructions go into icount_decr.u16.low; the remainder of the budget is
 * stored in icount_extra.  Finally the replay mutex is taken; the matching
 * replay_mutex_unlock() is in process_icount_data().
 */
1380 static void prepare_icount_for_run(CPUState
*cpu
)
1385 /* These should always be cleared by process_icount_data after
1386 * each vCPU execution. However u16.high can be raised
1387 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1389 g_assert(cpu_neg(cpu
)->icount_decr
.u16
.low
== 0);
1390 g_assert(cpu
->icount_extra
== 0);
1392 cpu
->icount_budget
= tcg_get_icount_limit();
1393 insns_left
= MIN(0xffff, cpu
->icount_budget
);
1394 cpu_neg(cpu
)->icount_decr
.u16
.low
= insns_left
;
1395 cpu
->icount_extra
= cpu
->icount_budget
- insns_left
;
1397 replay_mutex_lock();
/*
 * Account for the instructions @cpu executed and reset its budget.
 *
 * Counterpart of prepare_icount_for_run(): folds the executed instructions
 * into the global count via cpu_update_icount(), zeroes the low
 * decrementer, icount_extra and icount_budget, informs the replay code
 * about the executed instructions and releases the replay mutex.
 */
1401 static void process_icount_data(CPUState
*cpu
)
1404 /* Account for executed instructions */
1405 cpu_update_icount(cpu
);
1407 /* Reset the counters */
1408 cpu_neg(cpu
)->icount_decr
.u16
.low
= 0;
1409 cpu
->icount_extra
= 0;
1410 cpu
->icount_budget
= 0;
1412 replay_account_executed_instructions();
1414 replay_mutex_unlock();
/*
 * Execute guest code on @cpu with TCG.
 *
 * Requires TCG to be enabled (asserted).  The cpu_exec() call is preceded
 * by cpu_exec_start().  With CONFIG_PROFILER the wall time spent in the
 * call is added to tcg_ctx->prof.cpu_exec_time.
 *
 * NOTE(review): callers compare the result with EXCP_DEBUG and
 * EXCP_ATOMIC, so the return value is presumably the cpu_exec() result;
 * confirm.
 */
1419 static int tcg_cpu_exec(CPUState
*cpu
)
1422 #ifdef CONFIG_PROFILER
1426 assert(tcg_enabled());
1427 #ifdef CONFIG_PROFILER
1428 ti
= profile_getclock();
1430 cpu_exec_start(cpu
);
1431 ret
= cpu_exec(cpu
);
1433 #ifdef CONFIG_PROFILER
1434 atomic_set(&tcg_ctx
->prof
.cpu_exec_time
,
1435 tcg_ctx
->prof
.cpu_exec_time
+ profile_getclock() - ti
);
/*
 * Tear down a vCPU that has been flagged for unplug and can no longer
 * run: destroy its TCG state, clear cpu->created and signal qemu_cpu_cond
 * so that a thread waiting for the removal can proceed.
 */
1440 /* Destroy any remaining vCPUs which have been unplugged and have
1443 static void deal_with_unplugged_cpus(void)
1448 if (cpu
->unplug
&& !cpu_can_run(cpu
)) {
1449 qemu_tcg_destroy_vcpu(cpu
);
1450 cpu
->created
= false;
1451 qemu_cond_signal(&qemu_cpu_cond
);
/*
 * Thread entry point for single-threaded (round-robin) TCG.
 *
 * Start-up: registers with RCU and TCG, takes the iothread lock, records
 * thread handle/id, sets cpu->created, signals qemu_cpu_cond and finishes
 * the random seed setup.  It then sleeps on first_cpu->halt_cond until
 * first_cpu is no longer stopped, processes pending work, starts the kick
 * timer and sets cpu->exit_request.
 *
 * Each outer iteration briefly drops the iothread lock to take the replay
 * mutex first (fixing the lock order), accounts warp-timer time, runs due
 * virtual timers via handle_icount_deadline() and releases the replay
 * mutex.  The inner loop then walks the CPU list with CPU_NEXT() while
 * the current CPU has no queued work and no exit request:
 *   - tcg_current_rr_cpu is published for kickers,
 *   - QEMU_CLOCK_VIRTUAL is enabled unless SSTEP_NOTIMER single-stepping
 *     is active,
 *   - a runnable CPU executes without the iothread lock, bracketed by
 *     prepare_icount_for_run() / process_icount_data(); EXCP_DEBUG goes
 *     to cpu_handle_guest_debug() and EXCP_ATOMIC is retried through
 *     cpu_exec_step_atomic(), again without the iothread lock.
 * Afterwards tcg_current_rr_cpu is cleared, a pending exit_request is
 * reset, the main loop is notified when icount is in use and all CPUs are
 * idle, and the thread waits for I/O events and reaps unplugged CPUs.
 */
1457 /* Single-threaded TCG
1459 * In the single-threaded case each vCPU is simulated in turn. If
1460 * there is more than a single vCPU we create a simple timer to kick
1461 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1462 * This is done explicitly rather than relying on side-effects
1466 static void *qemu_tcg_rr_cpu_thread_fn(void *arg
)
1468 CPUState
*cpu
= arg
;
1470 assert(tcg_enabled());
1471 rcu_register_thread();
1472 tcg_register_thread();
1474 qemu_mutex_lock_iothread();
1475 qemu_thread_get_self(cpu
->thread
);
1477 cpu
->thread_id
= qemu_get_thread_id();
1478 cpu
->created
= true;
1480 qemu_cond_signal(&qemu_cpu_cond
);
1481 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1483 /* wait for initial kick-off after machine start */
1484 while (first_cpu
->stopped
) {
1485 qemu_cond_wait(first_cpu
->halt_cond
, &qemu_global_mutex
);
1487 /* process any pending work */
1490 qemu_wait_io_event_common(cpu
);
1494 start_tcg_kick_timer();
1498 /* process any pending work */
1499 cpu
->exit_request
= 1;
1502 qemu_mutex_unlock_iothread();
1503 replay_mutex_lock();
1504 qemu_mutex_lock_iothread();
1505 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1506 qemu_account_warp_timer();
1508 /* Run the timers here. This is much more efficient than
1509 * waking up the I/O thread and waiting for completion.
1511 handle_icount_deadline();
1513 replay_mutex_unlock();
1519 while (cpu
&& !cpu
->queued_work_first
&& !cpu
->exit_request
) {
1521 atomic_mb_set(&tcg_current_rr_cpu
, cpu
);
1524 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
,
1525 (cpu
->singlestep_enabled
& SSTEP_NOTIMER
) == 0);
1527 if (cpu_can_run(cpu
)) {
1530 qemu_mutex_unlock_iothread();
1531 prepare_icount_for_run(cpu
);
1533 r
= tcg_cpu_exec(cpu
);
1535 process_icount_data(cpu
);
1536 qemu_mutex_lock_iothread();
1538 if (r
== EXCP_DEBUG
) {
1539 cpu_handle_guest_debug(cpu
);
1541 } else if (r
== EXCP_ATOMIC
) {
1542 qemu_mutex_unlock_iothread();
1543 cpu_exec_step_atomic(cpu
);
1544 qemu_mutex_lock_iothread();
1547 } else if (cpu
->stop
) {
1549 cpu
= CPU_NEXT(cpu
);
1554 cpu
= CPU_NEXT(cpu
);
1555 } /* while (cpu && !cpu->exit_request).. */
1557 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1558 atomic_set(&tcg_current_rr_cpu
, NULL
);
1560 if (cpu
&& cpu
->exit_request
) {
1561 atomic_mb_set(&cpu
->exit_request
, 0);
1564 if (use_icount
&& all_cpu_threads_idle()) {
1566 * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1567 * in the main_loop, wake it up in order to start the warp timer.
1569 qemu_notify_event();
1572 qemu_tcg_rr_wait_io_event();
1573 deal_with_unplugged_cpus();
1576 rcu_unregister_thread();
/*
 * vCPU thread entry point used with the HAX accelerator.
 *
 * @arg is the CPUState driven by this thread.  After RCU registration and
 * taking the iothread lock, the thread records its handle and id, sets
 * cpu->created, signals qemu_cpu_cond and completes the random seed
 * setup.  The loop runs hax_smp_cpu_exec() while cpu_can_run() holds,
 * forwards EXCP_DEBUG to cpu_handle_guest_debug() and then calls
 * qemu_wait_io_event(), until the CPU is flagged for unplug and can no
 * longer run.  The thread unregisters from RCU before finishing.
 */
1580 static void *qemu_hax_cpu_thread_fn(void *arg
)
1582 CPUState
*cpu
= arg
;
1585 rcu_register_thread();
1586 qemu_mutex_lock_iothread();
1587 qemu_thread_get_self(cpu
->thread
);
1589 cpu
->thread_id
= qemu_get_thread_id();
1590 cpu
->created
= true;
1594 qemu_cond_signal(&qemu_cpu_cond
);
1595 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1598 if (cpu_can_run(cpu
)) {
1599 r
= hax_smp_cpu_exec(cpu
);
1600 if (r
== EXCP_DEBUG
) {
1601 cpu_handle_guest_debug(cpu
);
1605 qemu_wait_io_event(cpu
);
1606 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1607 rcu_unregister_thread();
/*
 * vCPU thread entry point used with the HVF accelerator; @arg is the
 * CPUState driven by this thread and HVF must be enabled (asserted).
 *
 * Follows the common vCPU thread shape: RCU registration, iothread lock,
 * thread handle/id, created flag plus qemu_cpu_cond signal, random seed.
 * The loop runs hvf_vcpu_exec() while cpu_can_run() holds, forwards
 * EXCP_DEBUG to cpu_handle_guest_debug() and then calls
 * qemu_wait_io_event(), until the CPU is flagged for unplug and can no
 * longer run.  Teardown destroys the HVF vCPU, clears cpu->created,
 * signals qemu_cpu_cond, drops the iothread lock and unregisters from RCU.
 */
1611 /* The HVF-specific vCPU thread function. This one should only run when the host
1612 * CPU supports the VMX "unrestricted guest" feature. */
1613 static void *qemu_hvf_cpu_thread_fn(void *arg
)
1615 CPUState
*cpu
= arg
;
1619 assert(hvf_enabled());
1621 rcu_register_thread();
1623 qemu_mutex_lock_iothread();
1624 qemu_thread_get_self(cpu
->thread
);
1626 cpu
->thread_id
= qemu_get_thread_id();
1632 /* signal CPU creation */
1633 cpu
->created
= true;
1634 qemu_cond_signal(&qemu_cpu_cond
);
1635 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1638 if (cpu_can_run(cpu
)) {
1639 r
= hvf_vcpu_exec(cpu
);
1640 if (r
== EXCP_DEBUG
) {
1641 cpu_handle_guest_debug(cpu
);
1644 qemu_wait_io_event(cpu
);
1645 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1647 hvf_vcpu_destroy(cpu
);
1648 cpu
->created
= false;
1649 qemu_cond_signal(&qemu_cpu_cond
);
1650 qemu_mutex_unlock_iothread();
1651 rcu_unregister_thread();
/*
 * vCPU thread entry point used with the WHPX accelerator.
 *
 * @arg is the CPUState driven by this thread.  After RCU registration,
 * taking the iothread lock and recording thread handle/id, the WHPX vCPU
 * is initialised; a failure is printed to stderr.  The thread then sets
 * cpu->created, signals qemu_cpu_cond and completes the random seed setup.
 *
 * The loop runs whpx_vcpu_exec() while cpu_can_run() holds and forwards
 * EXCP_DEBUG to cpu_handle_guest_debug().  Unlike the other accelerators
 * it waits on cpu->halt_cond itself while the thread is idle and then
 * calls qemu_wait_io_event_common() directly.  It ends once the CPU is
 * flagged for unplug and can no longer run; teardown destroys the WHPX
 * vCPU, clears cpu->created, signals qemu_cpu_cond, drops the iothread
 * lock and unregisters from RCU.
 */
1655 static void *qemu_whpx_cpu_thread_fn(void *arg
)
1657 CPUState
*cpu
= arg
;
1660 rcu_register_thread();
1662 qemu_mutex_lock_iothread();
1663 qemu_thread_get_self(cpu
->thread
);
1664 cpu
->thread_id
= qemu_get_thread_id();
1667 r
= whpx_init_vcpu(cpu
);
1669 fprintf(stderr
, "whpx_init_vcpu failed: %s\n", strerror(-r
));
1673 /* signal CPU creation */
1674 cpu
->created
= true;
1675 qemu_cond_signal(&qemu_cpu_cond
);
1676 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1679 if (cpu_can_run(cpu
)) {
1680 r
= whpx_vcpu_exec(cpu
);
1681 if (r
== EXCP_DEBUG
) {
1682 cpu_handle_guest_debug(cpu
);
1685 while (cpu_thread_is_idle(cpu
)) {
1686 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1688 qemu_wait_io_event_common(cpu
);
1689 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1691 whpx_destroy_vcpu(cpu
);
1692 cpu
->created
= false;
1693 qemu_cond_signal(&qemu_cpu_cond
);
1694 qemu_mutex_unlock_iothread();
1695 rcu_unregister_thread();
/*
 * APC routine that qemu_cpu_kick_thread() queues to a vCPU thread with
 * QueueUserAPC().  The argument is unused.
 */
1700 static void CALLBACK
dummy_apc_func(ULONG_PTR unused
)
/*
 * vCPU thread entry point for multi-threaded TCG (one thread per vCPU).
 *
 * @arg is the CPUState driven by this thread.  TCG must be enabled and
 * icount must not be in use (both asserted).  After registering with RCU
 * and TCG and taking the iothread lock, the thread records its handle and
 * id, sets cpu->created, signals qemu_cpu_cond, completes the random seed
 * setup and sets cpu->exit_request so pending work is processed first.
 *
 * In the loop a runnable CPU executes through tcg_cpu_exec() with the
 * iothread lock dropped.  Depending on the result the thread calls
 * cpu_handle_guest_debug(), asserts that the CPU is halted, or retries the
 * instruction with cpu_exec_step_atomic() (again without the iothread
 * lock).  Each iteration then clears exit_request and calls
 * qemu_wait_io_event().  The loop ends once the CPU is flagged for unplug
 * and can no longer run; teardown destroys the TCG vCPU, clears
 * cpu->created, signals qemu_cpu_cond, drops the iothread lock and
 * unregisters from RCU.
 */
1705 /* Multi-threaded TCG
1707 * In the multi-threaded case each vCPU has its own thread. The TLS
1708 * variable current_cpu can be used deep in the code to find the
1709 * current CPUState for a given thread.
1712 static void *qemu_tcg_cpu_thread_fn(void *arg
)
1714 CPUState
*cpu
= arg
;
1716 assert(tcg_enabled());
1717 g_assert(!use_icount
);
1719 rcu_register_thread();
1720 tcg_register_thread();
1722 qemu_mutex_lock_iothread();
1723 qemu_thread_get_self(cpu
->thread
);
1725 cpu
->thread_id
= qemu_get_thread_id();
1726 cpu
->created
= true;
1729 qemu_cond_signal(&qemu_cpu_cond
);
1730 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1732 /* process any pending work */
1733 cpu
->exit_request
= 1;
1736 if (cpu_can_run(cpu
)) {
1738 qemu_mutex_unlock_iothread();
1739 r
= tcg_cpu_exec(cpu
);
1740 qemu_mutex_lock_iothread();
1743 cpu_handle_guest_debug(cpu
);
1746 /* during start-up the vCPU is reset and the thread is
1747 * kicked several times. If we don't ensure we go back
1748 * to sleep in the halted state we won't cleanly
1749 * start-up when the vCPU is enabled.
1751 * cpu->halted should ensure we sleep in wait_io_event
1753 g_assert(cpu
->halted
);
1756 qemu_mutex_unlock_iothread();
1757 cpu_exec_step_atomic(cpu
);
1758 qemu_mutex_lock_iothread();
1760 /* Ignore everything else? */
1765 atomic_mb_set(&cpu
->exit_request
, 0);
1766 qemu_wait_io_event(cpu
);
1767 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1769 qemu_tcg_destroy_vcpu(cpu
);
1770 cpu
->created
= false;
1771 qemu_cond_signal(&qemu_cpu_cond
);
1772 qemu_mutex_unlock_iothread();
1773 rcu_unregister_thread();
/*
 * Interrupt the host thread that runs @cpu so it leaves guest execution
 * or a blocking wait.
 *
 * Two host-specific paths are visible:
 *   - pthread hosts: cpu->thread_kicked is tested and then set, and
 *     SIG_IPI is sent with pthread_kill().  An error other than ESRCH is
 *     reported on stderr.
 *   - Windows hosts: when called from another thread, the vCPU is kicked
 *     through whpx_vcpu_kick() if WHPX is enabled, otherwise a dummy APC
 *     is queued to cpu->hThread; a QueueUserAPC() failure is reported on
 *     stderr together with GetLastError().
 *
 * NOTE(review): the thread_kicked test presumably suppresses a second
 * kick while one is already pending; confirm.
 */
1777 static void qemu_cpu_kick_thread(CPUState
*cpu
)
1782 if (cpu
->thread_kicked
) {
1785 cpu
->thread_kicked
= true;
1786 err
= pthread_kill(cpu
->thread
->thread
, SIG_IPI
);
1787 if (err
&& err
!= ESRCH
) {
1788 fprintf(stderr
, "qemu:%s: %s", __func__
, strerror(err
));
1792 if (!qemu_cpu_is_self(cpu
)) {
1793 if (whpx_enabled()) {
1794 whpx_vcpu_kick(cpu
);
1795 } else if (!QueueUserAPC(dummy_apc_func
, cpu
->hThread
, 0)) {
1796 fprintf(stderr
, "%s: QueueUserAPC failed with error %lu\n",
1797 __func__
, GetLastError());
/*
 * Wake up @cpu.
 *
 * Waiters on cpu->halt_cond are always woken with a broadcast.  Under TCG
 * the kick is then routed according to whether multi-threaded TCG is
 * enabled; the single-threaded case goes through qemu_cpu_kick_rr_cpus().
 * For other accelerators the host thread is interrupted with
 * qemu_cpu_kick_thread(); with HAX cpu->exit_request is set first (see
 * the FIXME about the race with the exit_request check).
 */
1804 void qemu_cpu_kick(CPUState
*cpu
)
1806 qemu_cond_broadcast(cpu
->halt_cond
);
1807 if (tcg_enabled()) {
1808 if (qemu_tcg_mttcg_enabled()) {
1811 qemu_cpu_kick_rr_cpus();
1814 if (hax_enabled()) {
1816 * FIXME: race condition with the exit_request check in
1819 cpu
->exit_request
= 1;
1821 qemu_cpu_kick_thread(cpu
);
1825 void qemu_cpu_kick_self(void)
1827 assert(current_cpu
);
1828 qemu_cpu_kick_thread(current_cpu
);
1831 bool qemu_cpu_is_self(CPUState
*cpu
)
1833 return qemu_thread_is_self(cpu
->thread
);
1836 bool qemu_in_vcpu_thread(void)
1838 return current_cpu
&& qemu_cpu_is_self(current_cpu
);
/*
 * Per-thread record of whether this thread currently holds the iothread
 * lock.  Thread-local objects are zero-initialised, so every thread
 * starts out with "not held".
 */
static __thread bool iothread_locked;

/*
 * Returns: true if the calling thread holds the iothread lock according
 * to its thread-local flag, false otherwise.
 */
bool qemu_mutex_iothread_locked(void)
{
    bool held = iothread_locked;

    return held;
}
1849 * The BQL is taken from so many places that it is worth profiling the
1850 * callers directly, instead of funneling them all through a single function.
1852 void qemu_mutex_lock_iothread_impl(const char *file
, int line
)
1854 QemuMutexLockFunc bql_lock
= atomic_read(&qemu_bql_mutex_lock_func
);
1856 g_assert(!qemu_mutex_iothread_locked());
1857 bql_lock(&qemu_global_mutex
, file
, line
);
1858 iothread_locked
= true;
1861 void qemu_mutex_unlock_iothread(void)
1863 g_assert(qemu_mutex_iothread_locked());
1864 iothread_locked
= false;
1865 qemu_mutex_unlock(&qemu_global_mutex
);
1868 void qemu_cond_wait_iothread(QemuCond
*cond
)
1870 qemu_cond_wait(cond
, &qemu_global_mutex
);
/*
 * Check the stopped flag of the vCPUs.
 *
 * NOTE(review): judging by the name and by its use in pause_all_vcpus(),
 * this presumably returns false as soon as one CPU is not stopped and
 * true otherwise; confirm against the full body.
 */
1873 static bool all_vcpus_paused(void)
1878 if (!cpu
->stopped
) {
/*
 * Stop the vCPUs and wait until they have all paused.
 *
 * QEMU_CLOCK_VIRTUAL is disabled first.  A CPU whose thread is the caller
 * is stopped directly with qemu_cpu_stop().  The replay mutex is released
 * while waiting on qemu_pause_cond for all_vcpus_paused() to become true,
 * so that woken vCPU threads can finish their replay work.  Afterwards
 * the iothread lock is dropped, the replay mutex is re-taken and the
 * iothread lock is acquired again, which restores the replay-mutex-first
 * lock order.
 */
1886 void pause_all_vcpus(void)
1890 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, false);
1892 if (qemu_cpu_is_self(cpu
)) {
1893 qemu_cpu_stop(cpu
, true);
1900 /* We need to drop the replay_lock so any vCPU threads woken up
1901 * can finish their replay tasks
1903 replay_mutex_unlock();
1905 while (!all_vcpus_paused()) {
1906 qemu_cond_wait(&qemu_pause_cond
, &qemu_global_mutex
);
1912 qemu_mutex_unlock_iothread();
1913 replay_mutex_lock();
1914 qemu_mutex_lock_iothread();
/*
 * Let @cpu run again by clearing its stopped flag.
 */
1917 void cpu_resume(CPUState
*cpu
)
1920 cpu
->stopped
= false;
/*
 * Resume the vCPUs.  The run state is checked with runstate_is_running()
 * and QEMU_CLOCK_VIRTUAL is re-enabled.
 *
 * NOTE(review): presumably nothing is resumed while the VM is not in the
 * running state; confirm against the full body.
 */
1924 void resume_all_vcpus(void)
1928 if (!runstate_is_running()) {
1932 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, true);
/*
 * Wait for the thread of @cpu to terminate.
 *
 * The iothread lock is dropped around qemu_thread_join() and re-taken
 * afterwards.  The vCPU thread functions take the same lock during
 * teardown, so joining with it held would presumably deadlock.
 */
1938 void cpu_remove_sync(CPUState
*cpu
)
1943 qemu_mutex_unlock_iothread();
1944 qemu_thread_join(cpu
->thread
);
1945 qemu_mutex_lock_iothread();
/*
 * VCPU_THREAD_NAME_SIZE bounds the stack buffers used to format vCPU
 * thread names.
 *
 * qemu_tcg_init_vcpu() sets up the host thread for a TCG vCPU; TCG must
 * be enabled (asserted).
 *
 * TCG regions are initialised once, guarded by a function-static flag.
 * A new QemuThread and halt condition are allocated when multi-threaded
 * TCG is enabled or when no shared thread exists yet:
 *   - MTTCG: parallel_cpus is set and a joinable "CPU %d/TCG" thread is
 *     created running qemu_tcg_cpu_thread_fn();
 *   - otherwise: one joinable "ALL CPUs/TCG" thread is created running
 *     qemu_tcg_rr_cpu_thread_fn(), and its thread and halt condition are
 *     remembered in function-static variables.
 * Later CPUs in the non-MTTCG case reuse the remembered thread and halt
 * condition, copy first_cpu's thread id and are marked created at once.
 */
1948 /* For temporary buffers for forming a name */
1949 #define VCPU_THREAD_NAME_SIZE 16
1951 static void qemu_tcg_init_vcpu(CPUState
*cpu
)
1953 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1954 static QemuCond
*single_tcg_halt_cond
;
1955 static QemuThread
*single_tcg_cpu_thread
;
1956 static int tcg_region_inited
;
1958 assert(tcg_enabled());
1960 * Initialize TCG regions--once. Now is a good time, because:
1961 * (1) TCG's init context, prologue and target globals have been set up.
1962 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1963 * -accel flag is processed, so the check doesn't work then).
1965 if (!tcg_region_inited
) {
1966 tcg_region_inited
= 1;
1970 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread
) {
1971 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1972 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1973 qemu_cond_init(cpu
->halt_cond
);
1975 if (qemu_tcg_mttcg_enabled()) {
1976 /* create a thread per vCPU with TCG (MTTCG) */
1977 parallel_cpus
= true;
1978 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/TCG",
1981 qemu_thread_create(cpu
->thread
, thread_name
, qemu_tcg_cpu_thread_fn
,
1982 cpu
, QEMU_THREAD_JOINABLE
);
1985 /* share a single thread for all cpus with TCG */
1986 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "ALL CPUs/TCG");
1987 qemu_thread_create(cpu
->thread
, thread_name
,
1988 qemu_tcg_rr_cpu_thread_fn
,
1989 cpu
, QEMU_THREAD_JOINABLE
);
1991 single_tcg_halt_cond
= cpu
->halt_cond
;
1992 single_tcg_cpu_thread
= cpu
->thread
;
1995 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
1998 /* For non-MTTCG cases we share the thread */
1999 cpu
->thread
= single_tcg_cpu_thread
;
2000 cpu
->halt_cond
= single_tcg_halt_cond
;
2001 cpu
->thread_id
= first_cpu
->thread_id
;
2003 cpu
->created
= true;
/*
 * Create the host thread for a HAX vCPU.
 *
 * Allocates a zeroed QemuThread and halt condition for @cpu, initialises
 * the condition and starts a joinable thread named "CPU %d/HAX" that runs
 * qemu_hax_cpu_thread_fn() with @cpu as its argument.  The thread handle
 * is additionally stored in cpu->hThread.
 */
2007 static void qemu_hax_start_vcpu(CPUState
*cpu
)
2009 char thread_name
[VCPU_THREAD_NAME_SIZE
];
2011 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
2012 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2013 qemu_cond_init(cpu
->halt_cond
);
2015 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/HAX",
2017 qemu_thread_create(cpu
->thread
, thread_name
, qemu_hax_cpu_thread_fn
,
2018 cpu
, QEMU_THREAD_JOINABLE
);
2020 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
/*
 * Create the host thread for a KVM vCPU.
 *
 * Allocates a zeroed QemuThread and halt condition for @cpu, initialises
 * the condition and starts a joinable thread named "CPU %d/KVM" that runs
 * qemu_kvm_cpu_thread_fn() with @cpu as its argument.
 */
2024 static void qemu_kvm_start_vcpu(CPUState
*cpu
)
2026 char thread_name
[VCPU_THREAD_NAME_SIZE
];
2028 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
2029 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2030 qemu_cond_init(cpu
->halt_cond
);
2031 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/KVM",
2033 qemu_thread_create(cpu
->thread
, thread_name
, qemu_kvm_cpu_thread_fn
,
2034 cpu
, QEMU_THREAD_JOINABLE
);
/*
 * Create the host thread for an HVF vCPU; HVF must be enabled (asserted).
 *
 * Allocates a zeroed QemuThread and halt condition for @cpu, initialises
 * the condition and starts a joinable thread named "CPU %d/HVF" that runs
 * qemu_hvf_cpu_thread_fn() with @cpu as its argument.
 */
2037 static void qemu_hvf_start_vcpu(CPUState
*cpu
)
2039 char thread_name
[VCPU_THREAD_NAME_SIZE
];
2041 /* HVF currently does not support TCG, and only runs in
2042 * unrestricted-guest mode. */
2043 assert(hvf_enabled());
2045 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
2046 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2047 qemu_cond_init(cpu
->halt_cond
);
2049 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/HVF",
2051 qemu_thread_create(cpu
->thread
, thread_name
, qemu_hvf_cpu_thread_fn
,
2052 cpu
, QEMU_THREAD_JOINABLE
);
/*
 * Create the host thread for a WHPX vCPU.
 *
 * Allocates a zeroed QemuThread and halt condition for @cpu, initialises
 * the condition and starts a joinable thread named "CPU %d/WHPX" that
 * runs qemu_whpx_cpu_thread_fn() with @cpu as its argument.  The thread
 * handle is additionally stored in cpu->hThread.
 */
2055 static void qemu_whpx_start_vcpu(CPUState
*cpu
)
2057 char thread_name
[VCPU_THREAD_NAME_SIZE
];
2059 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
2060 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2061 qemu_cond_init(cpu
->halt_cond
);
2062 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/WHPX",
2064 qemu_thread_create(cpu
->thread
, thread_name
, qemu_whpx_cpu_thread_fn
,
2065 cpu
, QEMU_THREAD_JOINABLE
);
2067 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
/*
 * Create the host thread for a dummy vCPU.
 *
 * Allocates a zeroed QemuThread and halt condition for @cpu, initialises
 * the condition and starts a joinable thread named "CPU %d/DUMMY" that
 * runs qemu_dummy_cpu_thread_fn() with @cpu as its argument.
 */
2071 static void qemu_dummy_start_vcpu(CPUState
*cpu
)
2073 char thread_name
[VCPU_THREAD_NAME_SIZE
];
2075 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
2076 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2077 qemu_cond_init(cpu
->halt_cond
);
2078 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/DUMMY",
2080 qemu_thread_create(cpu
->thread
, thread_name
, qemu_dummy_cpu_thread_fn
, cpu
,
2081 QEMU_THREAD_JOINABLE
);
/*
 * Initialise @cpu and start its host thread.
 *
 * Copies the core and thread counts from the machine's SMP configuration,
 * marks the CPU stopped and takes the first half of the guest random
 * seed.  A default "cpu-memory" address space (index 0, backed by
 * cpu->memory) is set up for targets that did not create one themselves.
 *
 * The thread is started by the helper matching the active accelerator,
 * tested in this order: KVM, HAX, HVF, TCG, WHPX; the dummy CPU is the
 * fallback.  The function then waits on qemu_cpu_cond until the new
 * thread has set cpu->created.
 */
2084 void qemu_init_vcpu(CPUState
*cpu
)
2086 MachineState
*ms
= MACHINE(qdev_get_machine());
2088 cpu
->nr_cores
= ms
->smp
.cores
;
2089 cpu
->nr_threads
= ms
->smp
.threads
;
2090 cpu
->stopped
= true;
2091 cpu
->random_seed
= qemu_guest_random_seed_thread_part1();
2094 /* If the target cpu hasn't set up any address spaces itself,
2095 * give it the default one.
2098 cpu_address_space_init(cpu
, 0, "cpu-memory", cpu
->memory
);
2101 if (kvm_enabled()) {
2102 qemu_kvm_start_vcpu(cpu
);
2103 } else if (hax_enabled()) {
2104 qemu_hax_start_vcpu(cpu
);
2105 } else if (hvf_enabled()) {
2106 qemu_hvf_start_vcpu(cpu
);
2107 } else if (tcg_enabled()) {
2108 qemu_tcg_init_vcpu(cpu
);
2109 } else if (whpx_enabled()) {
2110 qemu_whpx_start_vcpu(cpu
);
2112 qemu_dummy_start_vcpu(cpu
);
2115 while (!cpu
->created
) {
2116 qemu_cond_wait(&qemu_cpu_cond
, &qemu_global_mutex
);
/*
 * Ask the CPU bound to the calling thread to stop: sets its stop flag and
 * calls cpu_exit() on it.
 */
2120 void cpu_stop_current(void)
2123 current_cpu
->stop
= true;
2124 cpu_exit(current_cpu
);
/*
 * Stop the VM and move it to run state @state.
 *
 * From a vCPU thread the stop is only requested
 * (qemu_system_vmstop_request_prepare() followed by
 * qemu_system_vmstop_request()); see the FIXME about still returning to
 * device code.  The other visible path returns the result of
 * do_vm_stop(@state, true).
 */
2128 int vm_stop(RunState state
)
2130 if (qemu_in_vcpu_thread()) {
2131 qemu_system_vmstop_request_prepare();
2132 qemu_system_vmstop_request(state
);
2134 * FIXME: should not return to device code in case
2135 * vm_stop() has been requested.
2141 return do_vm_stop(state
, true);
/*
 * vm_prepare_start(): bookkeeping done before the vCPUs are (re)started.
 *
 * Queries any pending vmstop request.  The visible checks distinguish
 * "already running with no stop request pending" (requested ==
 * RUN_STATE__MAX) from "running with a stop request pending", for which a
 * STOP/RESUME event pair is emitted.  On the normal path a RESUME event
 * is sent, the run state becomes RUN_STATE_RUNNING and the VM state
 * change notifiers are invoked with running = 1.
 */
2145 * Prepare for (re)starting the VM.
2146 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2147 * running or in case of an error condition), 0 otherwise.
2149 int vm_prepare_start(void)
2153 qemu_vmstop_requested(&requested
);
2154 if (runstate_is_running() && requested
== RUN_STATE__MAX
) {
2158 /* Ensure that a STOP/RESUME pair of events is emitted if a
2159 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2160 * example, according to documentation is always followed by
2163 if (runstate_is_running()) {
2164 qapi_event_send_stop();
2165 qapi_event_send_resume();
2169 /* We are sending this now, but the CPUs will be resumed shortly later */
2170 qapi_event_send_resume();
2173 runstate_set(RUN_STATE_RUNNING
);
2174 vm_state_notify(1, RUN_STATE_RUNNING
);
2180 if (!vm_prepare_start()) {
/*
 * Stop the VM and force run state @state.
 *
 * A running VM is stopped through vm_stop(@state), whose result is
 * returned.  On the other visible path the run state is set directly and
 * the result of bdrv_flush_all() is returned.
 */
2185 /* does a state transition even if the VM is already stopped,
2186 current state is forgotten forever */
2187 int vm_stop_force_state(RunState state
)
2189 if (runstate_is_running()) {
2190 return vm_stop(state
);
2192 runstate_set(state
);
2195 /* Make sure to return an error if the flush in a previous vm_stop()
2197 return bdrv_flush_all();
/*
 * List the available CPU models.  The listing is only compiled in when
 * the target defines the cpu_list macro.
 *
 * NOTE(review): how @optarg is used is not visible here; confirm.
 */
2201 void list_cpus(const char *optarg
)
2203 /* XXX: implement xxx_cpu_list for targets that still miss it */
2204 #if defined(cpu_list)
/*
 * QMP "memsave": dump @size bytes of memory starting at @addr, as seen
 * through a CPU's debug memory access, into @filename.
 *
 * The CPU is looked up with qemu_get_cpu(@cpu_index).  The file is opened
 * with mode "wb" and the memory is read with cpu_memory_rw_debug() into a
 * buffer that is then written out with fwrite().
 *
 * Errors reported through @errp:
 *   - invalid "cpu-index" parameter value,
 *   - failure to open @filename,
 *   - "Invalid addr .../size ... specified" when the memory read fails
 *     (the message carries the original address and size),
 *   - QERR_IO_ERROR on a short write.
 */
2209 void qmp_memsave(int64_t addr
, int64_t size
, const char *filename
,
2210 bool has_cpu
, int64_t cpu_index
, Error
**errp
)
2216 int64_t orig_addr
= addr
, orig_size
= size
;
2222 cpu
= qemu_get_cpu(cpu_index
);
2224 error_setg(errp
, QERR_INVALID_PARAMETER_VALUE
, "cpu-index",
2229 f
= fopen(filename
, "wb");
2231 error_setg_file_open(errp
, errno
, filename
);
2239 if (cpu_memory_rw_debug(cpu
, addr
, buf
, l
, 0) != 0) {
2240 error_setg(errp
, "Invalid addr 0x%016" PRIx64
"/size %" PRId64
2241 " specified", orig_addr
, orig_size
);
2244 if (fwrite(buf
, 1, l
, f
) != l
) {
2245 error_setg(errp
, QERR_IO_ERROR
);
/*
 * QMP "pmemsave": dump @size bytes of guest physical memory starting at
 * @addr into @filename.
 *
 * The file is opened with mode "wb"; memory is read with
 * cpu_physical_memory_read() into a buffer that is then written out with
 * fwrite().  A failure to open the file and a short write (QERR_IO_ERROR)
 * are reported through errp.
 */
2256 void qmp_pmemsave(int64_t addr
, int64_t size
, const char *filename
,
2263 f
= fopen(filename
, "wb");
2265 error_setg_file_open(errp
, errno
, filename
);
2273 cpu_physical_memory_read(addr
, buf
, l
);
2274 if (fwrite(buf
, 1, l
, f
) != l
) {
2275 error_setg(errp
, QERR_IO_ERROR
);
2286 void qmp_inject_nmi(Error
**errp
)
2288 nmi_monitor_handle(monitor_get_cpu_index(), errp
);
2291 void dump_drift_info(void)
2297 qemu_printf("Host - Guest clock %"PRIi64
" ms\n",
2298 (cpu_get_clock() - cpu_get_icount())/SCALE_MS
);
2299 if (icount_align_option
) {
2300 qemu_printf("Max guest delay %"PRIi64
" ms\n",
2301 -max_delay
/ SCALE_MS
);
2302 qemu_printf("Max guest advance %"PRIi64
" ms\n",
2303 max_advance
/ SCALE_MS
);
2305 qemu_printf("Max guest delay NA\n");
2306 qemu_printf("Max guest advance NA\n");