/*
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qemu/config-file.h"
#include "migration/vmstate.h"
#include "monitor/monitor.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-misc.h"
#include "qapi/qapi-events-run-state.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "qemu/qemu-print.h"
#include "sysemu/tcg.h"
#include "sysemu/block-backend.h"
#include "exec/gdbstub.h"
#include "sysemu/dma.h"
#include "sysemu/hw_accel.h"
#include "sysemu/kvm.h"
#include "sysemu/hax.h"
#include "sysemu/hvf.h"
#include "sysemu/whpx.h"
#include "exec/exec-all.h"
#include "qemu/thread.h"
#include "sysemu/cpus.h"
#include "sysemu/qtest.h"
#include "qemu/main-loop.h"
#include "qemu/option.h"
#include "qemu/bitmap.h"
#include "qemu/seqlock.h"
#include "qemu/guest-random.h"
#include "sysemu/replay.h"
#include "sysemu/runstate.h"
#include "hw/boards.h"
#ifdef CONFIG_LINUX
#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif
#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif
#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif
#endif /* CONFIG_LINUX */
/* vcpu throttling controls */
static QEMUTimer *throttle_timer;
static unsigned int throttle_percentage;

#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
#define CPU_THROTTLE_TIMESLICE_NS 10000000
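/*
 * Descriptive note (not part of the original comments): throttling works by
 * periodically forcing each vCPU to sleep for a slice of real time.
 * throttle_percentage selects how large that sleep is relative to the 10 ms
 * CPU_THROTTLE_TIMESLICE_NS period; see cpu_throttle_thread() and
 * cpu_throttle_timer_tick() below.
 */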
bool cpu_is_stopped(CPUState *cpu)
{
    return cpu->stopped || !runstate_is_running();
}

static bool cpu_thread_is_idle(CPUState *cpu)
{
    if (cpu->stop || cpu->queued_work_first) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return true;
    }
    if (!cpu->halted || cpu_has_work(cpu) ||
        kvm_halt_in_kernel()) {
        return false;
    }
    return true;
}

static bool all_cpu_threads_idle(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu_thread_is_idle(cpu)) {
            return false;
        }
    }
    return true;
}
/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

static bool icount_sleep = true;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
#define MAX_ICOUNT_SHIFT 10

typedef struct TimersState {
    /* Protected by BQL.  */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks.  */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;

    int64_t vm_clock_warp_start;
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    QEMUTimer *icount_warp_timer;
} TimersState;

static TimersState timers_state;
/*
 * We default to false if we know other options have been enabled
 * which are currently incompatible with MTTCG. Otherwise when each
 * guest (target) has been updated to support:
 *   - atomic instructions
 *   - memory ordering primitives (barriers)
 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 *
 * Once a guest architecture has been converted to the new primitives
 * there are two remaining limitations to check.
 *
 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 * - The host must have a stronger memory order than the guest
 *
 * It may be possible in future to support strong guests on weak hosts
 * but that will require tagging all load/stores in a guest with their
 * implicit memory order requirements which would likely slow things
 * down a lot.
 */

static bool check_tcg_memory_orders_compatible(void)
{
#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
    return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
#else
    return false;
#endif
}

static bool default_mttcg_enabled(void)
{
    if (use_icount || TCG_OVERSIZED_GUEST) {
        return false;
    } else {
#ifdef TARGET_SUPPORTS_MTTCG
        return check_tcg_memory_orders_compatible();
#else
        return false;
#endif
    }
}
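/*
 * Usage note (illustrative, not part of the original source): the "thread"
 * option parsed below normally comes from the command line, e.g.
 * "-accel tcg,thread=multi" or "-accel tcg,thread=single"; when it is
 * absent, default_mttcg_enabled() above picks the mode.
 */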
void qemu_tcg_configure(QemuOpts *opts, Error **errp)
{
    const char *t = qemu_opt_get(opts, "thread");
    if (t) {
        if (strcmp(t, "multi") == 0) {
            if (TCG_OVERSIZED_GUEST) {
                error_setg(errp, "No MTTCG when guest word size > hosts");
            } else if (use_icount) {
                error_setg(errp, "No MTTCG when icount is enabled");
            } else {
#ifndef TARGET_SUPPORTS_MTTCG
                warn_report("Guest not yet converted to MTTCG - "
                            "you may get unexpected results");
#endif
                if (!check_tcg_memory_orders_compatible()) {
                    warn_report("Guest expects a stronger memory ordering "
                                "than the host provides");
                    error_printf("This may cause strange/hard to debug errors\n");
                }
                mttcg_enabled = true;
            }
        } else if (strcmp(t, "single") == 0) {
            mttcg_enabled = false;
        } else {
            error_setg(errp, "Invalid 'thread' setting %s", t);
        }
    } else {
        mttcg_enabled = default_mttcg_enabled();
    }
}
/* The current number of executed instructions is based on what we
 * originally budgeted minus the current state of the decrementing
 * icount counters in extra/u16.low.
 */
static int64_t cpu_get_icount_executed(CPUState *cpu)
{
    return (cpu->icount_budget -
            (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
}
/*
 * Update the global shared timer_state.qemu_icount to take into
 * account executed instructions. This is done by the TCG vCPU
 * thread so the main-loop can see time has moved forward.
 */
static void cpu_update_icount_locked(CPUState *cpu)
{
    int64_t executed = cpu_get_icount_executed(cpu);
    cpu->icount_budget -= executed;

    atomic_set_i64(&timers_state.qemu_icount,
                   timers_state.qemu_icount + executed);
}

/*
 * Update the global shared timer_state.qemu_icount to take into
 * account executed instructions. This is done by the TCG vCPU
 * thread so the main-loop can see time has moved forward.
 */
void cpu_update_icount(CPUState *cpu)
{
    seqlock_write_lock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
    cpu_update_icount_locked(cpu);
    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                         &timers_state.vm_clock_lock);
}
static int64_t cpu_get_icount_raw_locked(void)
{
    CPUState *cpu = current_cpu;

    if (cpu && cpu->running) {
        if (!cpu->can_do_io) {
            error_report("Bad icount read");
            exit(1);
        }
        /* Take into account what has run */
        cpu_update_icount_locked(cpu);
    }
    /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
    return atomic_read_i64(&timers_state.qemu_icount);
}

static int64_t cpu_get_icount_locked(void)
{
    int64_t icount = cpu_get_icount_raw_locked();
    return atomic_read_i64(&timers_state.qemu_icount_bias) +
        cpu_icount_to_ns(icount);
}
int64_t cpu_get_icount_raw(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_raw_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}

/* Return the virtual CPU time, based on the instruction counter.  */
int64_t cpu_get_icount(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}

int64_t cpu_icount_to_ns(int64_t icount)
{
    return icount << atomic_read(&timers_state.icount_time_shift);
}
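/*
 * Worked example (derived from the code above): with icount_time_shift == 3,
 * each executed guest instruction accounts for 1 << 3 = 8 ns of
 * QEMU_CLOCK_VIRTUAL time, i.e. a nominal rate of 125 MIPS.
 */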
static int64_t cpu_get_ticks_locked(void)
{
    int64_t ticks = timers_state.cpu_ticks_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += cpu_get_host_ticks();
    }

    if (timers_state.cpu_ticks_prev > ticks) {
        /* Non increasing ticks may happen if the host uses software suspend.  */
        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        ticks = timers_state.cpu_ticks_prev;
    }

    timers_state.cpu_ticks_prev = ticks;
    return ticks;
}
/* return the time elapsed in VM between vm_start and vm_stop.  Unless
 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 * counter.
 */
int64_t cpu_get_ticks(void)
{
    int64_t ticks;

    if (use_icount) {
        return cpu_get_icount();
    }

    qemu_spin_lock(&timers_state.vm_clock_lock);
    ticks = cpu_get_ticks_locked();
    qemu_spin_unlock(&timers_state.vm_clock_lock);
    return ticks;
}
static int64_t cpu_get_clock_locked(void)
{
    int64_t time;

    time = timers_state.cpu_clock_offset;
    if (timers_state.cpu_ticks_enabled) {
        time += get_clock();
    }

    return time;
}

/* Return the monotonic time elapsed in VM, i.e.,
 * the time between vm_start and vm_stop
 */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = cpu_get_clock_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return ti;
}
/* enable cpu_get_ticks()
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    seqlock_write_lock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                         &timers_state.vm_clock_lock);
}
/* disable cpu_get_ticks() : the clock is stopped. You must not call
 * cpu_get_ticks() after that.
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    seqlock_write_lock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                         &timers_state.vm_clock_lock);
}
/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.  */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)

static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex.  */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_lock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && timers_state.icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        atomic_set(&timers_state.icount_time_shift,
                   timers_state.icount_time_shift - 1);
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        atomic_set(&timers_state.icount_time_shift,
                   timers_state.icount_time_shift + 1);
    }
    last_delta = delta;
    atomic_set_i64(&timers_state.qemu_icount_bias,
                   cur_icount - (timers_state.qemu_icount
                                 << timers_state.icount_time_shift));
    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                         &timers_state.vm_clock_lock);
}
static void icount_adjust_rt(void *opaque)
{
    timer_mod(timers_state.icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_adjust();
}

static void icount_adjust_vm(void *opaque)
{
    timer_mod(timers_state.icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              NANOSECONDS_PER_SECOND / 10);
    icount_adjust();
}
static int64_t qemu_icount_round(int64_t count)
{
    int shift = atomic_read(&timers_state.icount_time_shift);
    return (count + (1 << shift) - 1) >> shift;
}
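/*
 * Worked example (derived from the code above): with shift == 3, a 100 ns
 * deadline rounds up to (100 + 8 - 1) >> 3 = 13 instructions, i.e. the
 * ceiling of 100 / 8.
 */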
static void icount_warp_rt(void)
{
    unsigned seq;
    int64_t warp_start;

    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    do {
        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        warp_start = timers_state.vm_clock_warp_start;
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));

    if (warp_start == -1) {
        return;
    }

    seqlock_write_lock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
    if (runstate_is_running()) {
        int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
                                            cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - timers_state.vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        atomic_set_i64(&timers_state.qemu_icount_bias,
                       timers_state.qemu_icount_bias + warp_delta);
    }
    timers_state.vm_clock_warp_start = -1;
    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                         &timers_state.vm_clock_lock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}
static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}
void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    AioContext *aio_context;
    assert(qtest_enabled());
    aio_context = qemu_get_aio_context();
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);

        seqlock_write_lock(&timers_state.vm_clock_seqlock,
                           &timers_state.vm_clock_lock);
        atomic_set_i64(&timers_state.qemu_icount_bias,
                       timers_state.qemu_icount_bias + warp);
        seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                             &timers_state.vm_clock_lock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}
void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    if (replay_mode != REPLAY_MODE_PLAY) {
        if (!all_cpu_threads_idle()) {
            return;
        }

        if (qtest_enabled()) {
            /* When testing, qtest commands advance icount.  */
            return;
        }

        replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
    } else {
        /* warp clock deterministically in record/replay mode */
        if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
            /* vCPU is sleeping and warp can't be started.
               It is probably a race condition: notification sent
               to vCPU was processed in advance and vCPU went to sleep.
               Therefore we have to wake it up for doing something. */
            if (replay_has_checkpoint()) {
                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
            }
            return;
        }
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        static bool notified;
        if (!icount_sleep && !notified) {
            warn_report("icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep. Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * It is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_lock(&timers_state.vm_clock_seqlock,
                               &timers_state.vm_clock_lock);
            atomic_set_i64(&timers_state.qemu_icount_bias,
                           timers_state.qemu_icount_bias + deadline);
            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                                 &timers_state.vm_clock_lock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time, (related to the time left until the next event) has
             * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This avoids that the warps are visible externally; for example,
             * you will not be sending network packets continuously instead of
             * every 100ms.
             */
            seqlock_write_lock(&timers_state.vm_clock_seqlock,
                               &timers_state.vm_clock_lock);
            if (timers_state.vm_clock_warp_start == -1
                || timers_state.vm_clock_warp_start > clock) {
                timers_state.vm_clock_warp_start = clock;
            }
            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                                 &timers_state.vm_clock_lock);
            timer_mod_anticipate(timers_state.icount_warp_timer,
                                 clock + deadline);
        }
    } else if (deadline == 0) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}
static void qemu_account_warp_timer(void)
{
    if (!use_icount || !icount_sleep) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
        return;
    }

    timer_del(timers_state.icount_warp_timer);
    icount_warp_rt();
}
static bool icount_state_needed(void *opaque)
{
    return use_icount;
}

static bool warp_timer_state_needed(void *opaque)
{
    TimersState *s = opaque;

    return s->icount_warp_timer != NULL;
}

static bool adjust_timers_state_needed(void *opaque)
{
    TimersState *s = opaque;

    return s->icount_rt_timer != NULL;
}
/*
 * Subsection for warp timer migration is optional, because may not be created
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

/*
 * This is a subsection for icount migration.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription *[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        NULL
    }
};
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_UNUSED(8),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription *[]) {
        &icount_vmstate_timers,
        NULL
    }
};
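/*
 * Descriptive note (not part of the original comments): the icount
 * subsections above are only written when their .needed callbacks return
 * true, so migration streams produced without -icount are unaffected.
 */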
static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
{
    double pct;
    double throttle_ratio;
    long sleeptime_ns;

    if (!cpu_throttle_get_percentage()) {
        return;
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    throttle_ratio = pct / (1 - pct);
    sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);

    qemu_mutex_unlock_iothread();
    g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
    qemu_mutex_lock_iothread();
    atomic_set(&cpu->throttle_thread_scheduled, 0);
}
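/*
 * Worked example (derived from the code above): at 50% throttle,
 * throttle_ratio = 0.5 / 0.5 = 1, so the vCPU sleeps 10 ms per 10 ms
 * timeslice; at 99% it sleeps roughly 990 ms per timeslice.
 */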
static void cpu_throttle_timer_tick(void *opaque)
{
    CPUState *cpu;
    double pct;

    /* Stop the timer if needed */
    if (!cpu_throttle_get_percentage()) {
        return;
    }
    CPU_FOREACH(cpu) {
        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
            async_run_on_cpu(cpu, cpu_throttle_thread,
                             RUN_ON_CPU_NULL);
        }
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                                   CPU_THROTTLE_TIMESLICE_NS / (1-pct));
}
void cpu_throttle_set(int new_throttle_pct)
{
    /* Ensure throttle percentage is within valid range */
    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);

    atomic_set(&throttle_percentage, new_throttle_pct);

    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                                       CPU_THROTTLE_TIMESLICE_NS);
}

void cpu_throttle_stop(void)
{
    atomic_set(&throttle_percentage, 0);
}

bool cpu_throttle_active(void)
{
    return (cpu_throttle_get_percentage() != 0);
}

int cpu_throttle_get_percentage(void)
{
    return atomic_read(&throttle_percentage);
}
void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock);
    qemu_spin_init(&timers_state.vm_clock_lock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                  cpu_throttle_timer_tick, NULL);
}
void configure_icount(QemuOpts *opts, Error **errp)
{
    const char *option;
    char *rem_str = NULL;

    option = qemu_opt_get(opts, "shift");
    if (!option) {
        if (qemu_opt_get(opts, "align") != NULL) {
            error_setg(errp, "Please specify shift option when using align");
        }
        return;
    }

    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                                      icount_timer_cb, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);

    if (icount_align_option && !icount_sleep) {
        error_setg(errp, "align=on and sleep=off are incompatible");
    }
    if (strcmp(option, "auto") != 0) {
        errno = 0;
        timers_state.icount_time_shift = strtol(option, &rem_str, 0);
        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
            error_setg(errp, "icount: Invalid shift value");
        }
        use_icount = 1;
        return;
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
    } else if (!icount_sleep) {
        error_setg(errp, "shift=auto and sleep=off are incompatible");
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway.  */
    timers_state.icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers.  */
    timers_state.vm_clock_warp_start = -1;
    timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
                                                icount_adjust_rt, NULL);
    timer_mod(timers_state.icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                                icount_adjust_vm, NULL);
    timer_mod(timers_state.icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              NANOSECONDS_PER_SECOND / 10);
}
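/*
 * Usage note (illustrative, not part of the original source): these options
 * come from the command line, e.g. "-icount shift=auto,sleep=on,align=off".
 * A numeric shift fixes the instructions-per-nanosecond ratio
 * (use_icount == 1), while "auto" enables the adaptive mode driven by the
 * two adjustment timers above (use_icount == 2).
 */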
/***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */

static QEMUTimer *tcg_kick_vcpu_timer;
static CPUState *tcg_current_rr_cpu;

#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
static inline int64_t qemu_tcg_next_kick(void)
{
    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
}
/* Kick the currently round-robin scheduled vCPU */
static void qemu_cpu_kick_rr_cpu(void)
{
    CPUState *cpu;
    do {
        cpu = atomic_mb_read(&tcg_current_rr_cpu);
        if (cpu) {
            cpu_exit(cpu);
        }
    } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
}
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}

void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
{
    if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
        qemu_notify_event();
        return;
    }

    if (qemu_in_vcpu_thread()) {
        /* A CPU is currently running; kick it back out to the
         * tcg_cpu_exec() loop so it will recalculate its
         * icount deadline immediately.
         */
        qemu_cpu_kick(current_cpu);
    } else if (first_cpu) {
        /* qemu_cpu_kick is not enough to kick a halted CPU out of
         * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
         * causes cpu_thread_is_idle to return false.  This way,
         * handle_icount_deadline can run.
         * If we have no CPUs at all for some reason, we don't
         * need to do anything.
         */
        async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
    }
}
static void kick_tcg_thread(void *opaque)
{
    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    qemu_cpu_kick_rr_cpu();
}
static void start_tcg_kick_timer(void)
{
    assert(!mttcg_enabled);
    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                           kick_tcg_thread, NULL);
    }
    if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    }
}

static void stop_tcg_kick_timer(void)
{
    assert(!mttcg_enabled);
    if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
        timer_del(tcg_kick_vcpu_timer);
    }
}
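/*
 * Descriptive note (not part of the original comments): the kick timer is
 * only armed when there is more than one vCPU (CPU_NEXT(first_cpu) is
 * non-NULL), since a single round-robin vCPU never needs to be preempted in
 * favour of another.
 */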
/***********************************************************/
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    CPU_FOREACH(cpu) {
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}
void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_state(cpu);
        /* TODO: move to cpu_synchronize_state() */
        if (hvf_enabled()) {
            hvf_cpu_synchronize_state(cpu);
        }
    }
}

void cpu_synchronize_all_post_reset(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_reset(cpu);
        /* TODO: move to cpu_synchronize_post_reset() */
        if (hvf_enabled()) {
            hvf_cpu_synchronize_post_reset(cpu);
        }
    }
}

void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_init(cpu);
        /* TODO: move to cpu_synchronize_post_init() */
        if (hvf_enabled()) {
            hvf_cpu_synchronize_post_init(cpu);
        }
    }
}

void cpu_synchronize_all_pre_loadvm(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_pre_loadvm(cpu);
    }
}
static int do_vm_stop(RunState state, bool send_stop)
{
    int ret = 0;

    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        if (send_stop) {
            qapi_event_send_stop();
        }
    }

    bdrv_drain_all();
    replay_disable_events();
    ret = bdrv_flush_all();

    return ret;
}

/* Special vm_stop() variant for terminating the process.  Historically clients
 * did not expect a QMP STOP event and so we need to retain compatibility.
 */
int vm_shutdown(void)
{
    return do_vm_stop(RUN_STATE_SHUTDOWN, false);
}
static bool cpu_can_run(CPUState *cpu)
{
    if (cpu->stop) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return false;
    }
    return true;
}

static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
#ifdef CONFIG_LINUX
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}
static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
{
    if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
        sigbus_reraise();
    }

    if (current_cpu) {
        /* Called asynchronously in VCPU thread.  */
        if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
            sigbus_reraise();
        }
    } else {
        /* Called synchronously (via signalfd) in main thread.  */
        if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
            sigbus_reraise();
        }
    }
}
static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}
#else /* !CONFIG_LINUX */
static void qemu_init_sigbus(void)
{
}
#endif /* !CONFIG_LINUX */
static QemuMutex qemu_global_mutex;

static QemuThread io_thread;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;

void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
static void qemu_kvm_destroy_vcpu(CPUState *cpu)
{
    if (kvm_destroy_vcpu(cpu) < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}

static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
static void qemu_tcg_rr_wait_io_event(void)
{
    CPUState *cpu;

    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}
static void qemu_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

#ifdef _WIN32
    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
    if (!tcg_enabled()) {
        SleepEx(0, TRUE);
    }
#endif
    qemu_wait_io_event_common(cpu);
}
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
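/*
 * Descriptive note (not part of the original comments): every accelerator
 * vCPU thread below follows the same shape as qemu_kvm_cpu_thread_fn above:
 * register with RCU, take the BQL, signal creation on qemu_cpu_cond, then
 * loop between running the guest and qemu_wait_io_event() until unplugged.
 */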
static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    error_report("qtest is not supported under Windows");
    exit(1);
#else
    CPUState *cpu = arg;
    sigset_t waitset;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug);

    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
#endif
}
static int64_t tcg_get_icount_limit(void)
{
    int64_t deadline;

    if (replay_mode != REPLAY_MODE_PLAY) {
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        /* Maintain prior (possibly buggy) behaviour where if no deadline
         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
         * nanoseconds.
         */
        if ((deadline < 0) || (deadline > INT32_MAX)) {
            deadline = INT32_MAX;
        }

        return qemu_icount_round(deadline);
    } else {
        return replay_get_instructions();
    }
}
static void handle_icount_deadline(void)
{
    assert(qemu_in_vcpu_thread());
    if (use_icount) {
        int64_t deadline =
            qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        if (deadline == 0) {
            /* Wake up other AioContexts.  */
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
            qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        }
    }
}
static void prepare_icount_for_run(CPUState *cpu)
{
    if (use_icount) {
        int insns_left;

        /* These should always be cleared by process_icount_data after
         * each vCPU execution. However u16.high can be raised
         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
         */
        g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
        g_assert(cpu->icount_extra == 0);

        cpu->icount_budget = tcg_get_icount_limit();
        insns_left = MIN(0xffff, cpu->icount_budget);
        cpu_neg(cpu)->icount_decr.u16.low = insns_left;
        cpu->icount_extra = cpu->icount_budget - insns_left;

        replay_mutex_lock();
    }
}
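/*
 * Worked example (derived from the code above): a budget of 100000
 * instructions is split into u16.low = 0xffff (65535) and
 * icount_extra = 34465; the translated code only decrements u16.low, and
 * cpu_get_icount_executed() recombines both parts.
 */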
static void process_icount_data(CPUState *cpu)
{
    if (use_icount) {
        /* Account for executed instructions */
        cpu_update_icount(cpu);

        /* Reset the counters */
        cpu_neg(cpu)->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        cpu->icount_budget = 0;

        replay_account_executed_instructions();

        replay_mutex_unlock();
    }
}
static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

    assert(tcg_enabled());
#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    atomic_set(&tcg_ctx->prof.cpu_exec_time,
               tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
#endif
    return ret;
}
/* Destroy any remaining vCPUs which have been unplugged and have
 * finished running
 */
static void deal_with_unplugged_cpus(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (cpu->unplug && !cpu_can_run(cpu)) {
            qemu_tcg_destroy_vcpu(cpu);
            cpu->created = false;
            qemu_cond_signal(&qemu_cpu_cond);
            break;
        }
    }
}
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 */

static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        if (!cpu) {
            cpu = first_cpu;
        }

        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    rcu_unregister_thread();
    return NULL;
}
static void *qemu_hax_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();
    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    current_cpu = cpu;

    hax_init_vcpu(cpu);
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = hax_smp_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }

        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));
    rcu_unregister_thread();
    return NULL;
}
/* The HVF-specific vCPU thread function. This one should only run when the host
 * CPU supports the VMX "unrestricted guest" feature. */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
#ifdef _WIN32
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
#endif
/* Multi-threaded TCG
 *
 * In the multi-threaded case each vCPU has its own thread. The TLS
 * variable current_cpu can be used deep in the code to find the
 * current CPUState for a given thread.
 */

static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    g_assert(!use_icount);

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* process any pending work */
    cpu->exit_request = 1;

    do {
        if (cpu_can_run(cpu)) {
            int r;
            qemu_mutex_unlock_iothread();
            r = tcg_cpu_exec(cpu);
            qemu_mutex_lock_iothread();
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start-up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
            default:
                /* Ignore everything else? */
                break;
            }
        }

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    qemu_tcg_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    if (cpu->thread_kicked) {
        return;
    }
    cpu->thread_kicked = true;
    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err && err != ESRCH) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    if (!qemu_cpu_is_self(cpu)) {
        if (whpx_enabled()) {
            whpx_vcpu_kick(cpu);
        } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
            fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
                    __func__, GetLastError());
            exit(1);
        }
    }
#endif
}
void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (tcg_enabled()) {
        cpu_exit(cpu);
        /* NOP unless doing single-thread RR */
        qemu_cpu_kick_rr_cpu();
    } else {
        if (hax_enabled()) {
            /*
             * FIXME: race condition with the exit_request check in
             * hax_vcpu_hax_exec
             */
            cpu->exit_request = 1;
        }
        qemu_cpu_kick_thread(cpu);
    }
}
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}

bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}

bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}
static __thread bool iothread_locked = false;

bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}

/*
 * The BQL is taken from so many places that it is worth profiling the
 * callers directly, instead of funneling them all through a single function.
 */
void qemu_mutex_lock_iothread_impl(const char *file, int line)
{
    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);

    g_assert(!qemu_mutex_iothread_locked());
    bql_lock(&qemu_global_mutex, file, line);
    iothread_locked = true;
}

void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
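/*
 * Descriptive note (not part of the original comments): iothread_locked is a
 * per-thread (__thread) flag, so qemu_mutex_iothread_locked() only reports
 * whether the calling thread currently holds the BQL, not whether some other
 * thread does.
 */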
static bool all_vcpus_paused(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu->stopped) {
            return false;
        }
    }

    return true;
}

void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}

void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
/* For temporary buffers for forming a name */
#define VCPU_THREAD_NAME_SIZE 16
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *single_tcg_halt_cond;
    static QemuThread *single_tcg_cpu_thread;
    static int tcg_region_inited;

    assert(tcg_enabled());
    /*
     * Initialize TCG regions--once. Now is a good time, because:
     * (1) TCG's init context, prologue and target globals have been set up.
     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
     *     -accel flag is processed, so the check doesn't work then).
     */
    if (!tcg_region_inited) {
        tcg_region_inited = 1;
        tcg_region_init();
    }

    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);

        if (qemu_tcg_mttcg_enabled()) {
            /* create a thread per vCPU with TCG (MTTCG) */
            parallel_cpus = true;
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                     cpu->cpu_index);

            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

        } else {
            /* share a single thread for all cpus with TCG */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
            qemu_thread_create(cpu->thread, thread_name,
                               qemu_tcg_rr_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

            single_tcg_halt_cond = cpu->halt_cond;
            single_tcg_cpu_thread = cpu->thread;
        }
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    } else {
        /* For non-MTTCG cases we share the thread */
        cpu->thread = single_tcg_cpu_thread;
        cpu->halt_cond = single_tcg_halt_cond;
        cpu->thread_id = first_cpu->thread_id;
        cpu->created = true;
    }
}
static void qemu_hax_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);

    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
    cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
}
static void qemu_kvm_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
}
static void qemu_hvf_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    /* HVF currently does not support TCG, and only runs in
     * unrestricted-guest mode. */
    assert(hvf_enabled());

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);

    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
}
static void qemu_whpx_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
    cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
}
static void qemu_dummy_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
                       QEMU_THREAD_JOINABLE);
}
void qemu_init_vcpu(CPUState *cpu)
{
    MachineState *ms = MACHINE(qdev_get_machine());

    cpu->nr_cores = ms->smp.cores;
    cpu->nr_threads = ms->smp.threads;
    cpu->stopped = true;
    cpu->random_seed = qemu_guest_random_seed_thread_part1();

    if (!cpu->as) {
        /* If the target cpu hasn't set up any address spaces itself,
         * give it the default one.
         */
        cpu->num_ases = 1;
        cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
    }

    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
    } else if (hax_enabled()) {
        qemu_hax_start_vcpu(cpu);
    } else if (hvf_enabled()) {
        qemu_hvf_start_vcpu(cpu);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else if (whpx_enabled()) {
        qemu_whpx_start_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }

    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}
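/*
 * Descriptive note (not part of the original comments): qemu_init_vcpu()
 * picks exactly one accelerator backend and then blocks on qemu_cpu_cond
 * until the new vCPU thread sets cpu->created, so the caller can rely on the
 * thread being fully started when this function returns.
 */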
void cpu_stop_current(void)
{
    if (current_cpu) {
        current_cpu->stop = true;
        cpu_exit(current_cpu);
    }
}
int vm_stop(RunState state)
{
    if (qemu_in_vcpu_thread()) {
        qemu_system_vmstop_request_prepare();
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return 0;
    }

    return do_vm_stop(state, true);
}

/**
 * Prepare for (re)starting the VM.
 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
 * running or in case of an error condition), 0 otherwise.
 */
int vm_prepare_start(void)
{
    RunState requested;

    qemu_vmstop_requested(&requested);
    if (runstate_is_running() && requested == RUN_STATE__MAX) {
        return -1;
    }

    /* Ensure that a STOP/RESUME pair of events is emitted if a
     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
     * example, according to documentation is always followed by
     * the STOP event.
     */
    if (runstate_is_running()) {
        qapi_event_send_stop();
        qapi_event_send_resume();
        return -1;
    }

    /* We are sending this now, but the CPUs will be resumed shortly later */
    qapi_event_send_resume();

    replay_enable_events();
    cpu_enable_ticks();
    runstate_set(RUN_STATE_RUNNING);
    vm_state_notify(1, RUN_STATE_RUNNING);
    return 0;
}

void vm_start(void)
{
    if (!vm_prepare_start()) {
        resume_all_vcpus();
    }
}
/* does a state transition even if the VM is already stopped,
   current state is forgotten forever */
int vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        return vm_stop(state);
    } else {
        runstate_set(state);

        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
        return bdrv_flush_all();
    }
}
void list_cpus(const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list();
#endif
}
void qmp_memsave(int64_t addr, int64_t size, const char *filename,
                 bool has_cpu, int64_t cpu_index, Error **errp)
{
    FILE *f;
    uint32_t l;
    CPUState *cpu;
    uint8_t buf[1024];
    int64_t orig_addr = addr, orig_size = size;

    if (!has_cpu) {
        cpu_index = 0;
    }

    cpu = qemu_get_cpu(cpu_index);
    if (cpu == NULL) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
                   "a CPU number");
        return;
    }

    f = fopen(filename, "wb");
    if (f == NULL) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
                             " specified", orig_addr, orig_size);
            goto exit;
        }
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}
void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
                  Error **errp)
{
    FILE *f;
    uint32_t l;
    uint8_t buf[1024];

    f = fopen(filename, "wb");
    if (f == NULL) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        cpu_physical_memory_read(addr, buf, l);
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}
void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}
void dump_drift_info(void)
{
    if (!use_icount) {
        return;
    }

    qemu_printf("Host - Guest clock %"PRIi64" ms\n",
                (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
    if (icount_align_option) {
        qemu_printf("Max guest delay %"PRIi64" ms\n",
                    -max_delay / SCALE_MS);
        qemu_printf("Max guest advance %"PRIi64" ms\n",
                    max_advance / SCALE_MS);
    } else {
        qemu_printf("Max guest delay NA\n");
        qemu_printf("Max guest advance NA\n");
    }
}