intel-iommu: introduce Intel IOMMU (VT-d) emulation
[qemu.git] / cpus.c
blob2b5c0bd7c74c07a85ddaf8eda145ef4ad99c003c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
25 /* Needed early for CONFIG_BSD etc. */
26 #include "config-host.h"
28 #include "monitor/monitor.h"
29 #include "qapi/qmp/qerror.h"
30 #include "sysemu/sysemu.h"
31 #include "exec/gdbstub.h"
32 #include "sysemu/dma.h"
33 #include "sysemu/kvm.h"
34 #include "qmp-commands.h"
36 #include "qemu/thread.h"
37 #include "sysemu/cpus.h"
38 #include "sysemu/qtest.h"
39 #include "qemu/main-loop.h"
40 #include "qemu/bitmap.h"
41 #include "qemu/seqlock.h"
42 #include "qapi-event.h"
44 #ifndef _WIN32
45 #include "qemu/compatfd.h"
46 #endif
48 #ifdef CONFIG_LINUX
50 #include <sys/prctl.h>
52 #ifndef PR_MCE_KILL
53 #define PR_MCE_KILL 33
54 #endif
56 #ifndef PR_MCE_KILL_SET
57 #define PR_MCE_KILL_SET 1
58 #endif
60 #ifndef PR_MCE_KILL_EARLY
61 #define PR_MCE_KILL_EARLY 1
62 #endif
64 #endif /* CONFIG_LINUX */
66 static CPUState *next_cpu;
67 int64_t max_delay;
68 int64_t max_advance;
70 bool cpu_is_stopped(CPUState *cpu)
72 return cpu->stopped || !runstate_is_running();
75 static bool cpu_thread_is_idle(CPUState *cpu)
77 if (cpu->stop || cpu->queued_work_first) {
78 return false;
80 if (cpu_is_stopped(cpu)) {
81 return true;
83 if (!cpu->halted || cpu_has_work(cpu) ||
84 kvm_halt_in_kernel()) {
85 return false;
87 return true;
90 static bool all_cpu_threads_idle(void)
92 CPUState *cpu;
94 CPU_FOREACH(cpu) {
95 if (!cpu_thread_is_idle(cpu)) {
96 return false;
99 return true;
102 /***********************************************************/
103 /* guest cycle counter */
105 /* Protected by TimersState seqlock */
107 static int64_t vm_clock_warp_start = -1;
108 /* Conversion factor from emulated instructions to virtual clock ticks. */
109 static int icount_time_shift;
110 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
111 #define MAX_ICOUNT_SHIFT 10
113 static QEMUTimer *icount_rt_timer;
114 static QEMUTimer *icount_vm_timer;
115 static QEMUTimer *icount_warp_timer;
117 typedef struct TimersState {
118 /* Protected by BQL. */
119 int64_t cpu_ticks_prev;
120 int64_t cpu_ticks_offset;
122 /* cpu_clock_offset can be read out of BQL, so protect it with
123 * this lock.
125 QemuSeqLock vm_clock_seqlock;
126 int64_t cpu_clock_offset;
127 int32_t cpu_ticks_enabled;
128 int64_t dummy;
130 /* Compensate for varying guest execution speed. */
131 int64_t qemu_icount_bias;
132 /* Only written by TCG thread */
133 int64_t qemu_icount;
134 } TimersState;
136 static TimersState timers_state;
138 /* Return the virtual CPU time, based on the instruction counter. */
139 static int64_t cpu_get_icount_locked(void)
141 int64_t icount;
142 CPUState *cpu = current_cpu;
144 icount = timers_state.qemu_icount;
145 if (cpu) {
146 if (!cpu_can_do_io(cpu)) {
147 fprintf(stderr, "Bad clock read\n");
149 icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
151 return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
154 int64_t cpu_get_icount(void)
156 int64_t icount;
157 unsigned start;
159 do {
160 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
161 icount = cpu_get_icount_locked();
162 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
164 return icount;
167 int64_t cpu_icount_to_ns(int64_t icount)
169 return icount << icount_time_shift;
172 /* return the host CPU cycle counter and handle stop/restart */
173 /* Caller must hold the BQL */
174 int64_t cpu_get_ticks(void)
176 int64_t ticks;
178 if (use_icount) {
179 return cpu_get_icount();
182 ticks = timers_state.cpu_ticks_offset;
183 if (timers_state.cpu_ticks_enabled) {
184 ticks += cpu_get_real_ticks();
187 if (timers_state.cpu_ticks_prev > ticks) {
188 /* Note: non increasing ticks may happen if the host uses
189 software suspend */
190 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
191 ticks = timers_state.cpu_ticks_prev;
194 timers_state.cpu_ticks_prev = ticks;
195 return ticks;
198 static int64_t cpu_get_clock_locked(void)
200 int64_t ticks;
202 ticks = timers_state.cpu_clock_offset;
203 if (timers_state.cpu_ticks_enabled) {
204 ticks += get_clock();
207 return ticks;
210 /* return the host CPU monotonic timer and handle stop/restart */
211 int64_t cpu_get_clock(void)
213 int64_t ti;
214 unsigned start;
216 do {
217 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
218 ti = cpu_get_clock_locked();
219 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
221 return ti;
224 /* return the offset between the host clock and virtual CPU clock */
225 int64_t cpu_get_clock_offset(void)
227 int64_t ti;
228 unsigned start;
230 do {
231 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
232 ti = timers_state.cpu_clock_offset;
233 if (!timers_state.cpu_ticks_enabled) {
234 ti -= get_clock();
236 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
238 return -ti;
241 /* enable cpu_get_ticks()
242 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
244 void cpu_enable_ticks(void)
246 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
247 seqlock_write_lock(&timers_state.vm_clock_seqlock);
248 if (!timers_state.cpu_ticks_enabled) {
249 timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
250 timers_state.cpu_clock_offset -= get_clock();
251 timers_state.cpu_ticks_enabled = 1;
253 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
256 /* disable cpu_get_ticks() : the clock is stopped. You must not call
257 * cpu_get_ticks() after that.
258 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
260 void cpu_disable_ticks(void)
262 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
263 seqlock_write_lock(&timers_state.vm_clock_seqlock);
264 if (timers_state.cpu_ticks_enabled) {
265 timers_state.cpu_ticks_offset += cpu_get_real_ticks();
266 timers_state.cpu_clock_offset = cpu_get_clock_locked();
267 timers_state.cpu_ticks_enabled = 0;
269 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.  The tolerance is one tenth of a second of ticks. */
#define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)
278 static void icount_adjust(void)
280 int64_t cur_time;
281 int64_t cur_icount;
282 int64_t delta;
284 /* Protected by TimersState mutex. */
285 static int64_t last_delta;
287 /* If the VM is not running, then do nothing. */
288 if (!runstate_is_running()) {
289 return;
292 seqlock_write_lock(&timers_state.vm_clock_seqlock);
293 cur_time = cpu_get_clock_locked();
294 cur_icount = cpu_get_icount_locked();
296 delta = cur_icount - cur_time;
297 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
298 if (delta > 0
299 && last_delta + ICOUNT_WOBBLE < delta * 2
300 && icount_time_shift > 0) {
301 /* The guest is getting too far ahead. Slow time down. */
302 icount_time_shift--;
304 if (delta < 0
305 && last_delta - ICOUNT_WOBBLE > delta * 2
306 && icount_time_shift < MAX_ICOUNT_SHIFT) {
307 /* The guest is getting too far behind. Speed time up. */
308 icount_time_shift++;
310 last_delta = delta;
311 timers_state.qemu_icount_bias = cur_icount
312 - (timers_state.qemu_icount << icount_time_shift);
313 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
316 static void icount_adjust_rt(void *opaque)
318 timer_mod(icount_rt_timer,
319 qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000);
320 icount_adjust();
323 static void icount_adjust_vm(void *opaque)
325 timer_mod(icount_vm_timer,
326 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
327 get_ticks_per_sec() / 10);
328 icount_adjust();
331 static int64_t qemu_icount_round(int64_t count)
333 return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
336 static void icount_warp_rt(void *opaque)
338 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
339 * changes from -1 to another value, so the race here is okay.
341 if (atomic_read(&vm_clock_warp_start) == -1) {
342 return;
345 seqlock_write_lock(&timers_state.vm_clock_seqlock);
346 if (runstate_is_running()) {
347 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
348 int64_t warp_delta;
350 warp_delta = clock - vm_clock_warp_start;
351 if (use_icount == 2) {
353 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
354 * far ahead of real time.
356 int64_t cur_time = cpu_get_clock_locked();
357 int64_t cur_icount = cpu_get_icount_locked();
358 int64_t delta = cur_time - cur_icount;
359 warp_delta = MIN(warp_delta, delta);
361 timers_state.qemu_icount_bias += warp_delta;
363 vm_clock_warp_start = -1;
364 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
366 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
367 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
371 void qtest_clock_warp(int64_t dest)
373 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
374 assert(qtest_enabled());
375 while (clock < dest) {
376 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
377 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
378 seqlock_write_lock(&timers_state.vm_clock_seqlock);
379 timers_state.qemu_icount_bias += warp;
380 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
382 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
383 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
385 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
388 void qemu_clock_warp(QEMUClockType type)
390 int64_t clock;
391 int64_t deadline;
394 * There are too many global variables to make the "warp" behavior
395 * applicable to other clocks. But a clock argument removes the
396 * need for if statements all over the place.
398 if (type != QEMU_CLOCK_VIRTUAL || !use_icount) {
399 return;
403 * If the CPUs have been sleeping, advance QEMU_CLOCK_VIRTUAL timer now.
404 * This ensures that the deadline for the timer is computed correctly below.
405 * This also makes sure that the insn counter is synchronized before the
406 * CPU starts running, in case the CPU is woken by an event other than
407 * the earliest QEMU_CLOCK_VIRTUAL timer.
409 icount_warp_rt(NULL);
410 timer_del(icount_warp_timer);
411 if (!all_cpu_threads_idle()) {
412 return;
415 if (qtest_enabled()) {
416 /* When testing, qtest commands advance icount. */
417 return;
420 /* We want to use the earliest deadline from ALL vm_clocks */
421 clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
422 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
423 if (deadline < 0) {
424 return;
427 if (deadline > 0) {
429 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
430 * sleep. Otherwise, the CPU might be waiting for a future timer
431 * interrupt to wake it up, but the interrupt never comes because
432 * the vCPU isn't running any insns and thus doesn't advance the
433 * QEMU_CLOCK_VIRTUAL.
435 * An extreme solution for this problem would be to never let VCPUs
436 * sleep in icount mode if there is a pending QEMU_CLOCK_VIRTUAL
437 * timer; rather time could just advance to the next QEMU_CLOCK_VIRTUAL
438 * event. Instead, we do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL
439 * after some e"real" time, (related to the time left until the next
440 * event) has passed. The QEMU_CLOCK_REALTIME timer will do this.
441 * This avoids that the warps are visible externally; for example,
442 * you will not be sending network packets continuously instead of
443 * every 100ms.
445 seqlock_write_lock(&timers_state.vm_clock_seqlock);
446 if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
447 vm_clock_warp_start = clock;
449 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
450 timer_mod_anticipate(icount_warp_timer, clock + deadline);
451 } else if (deadline == 0) {
452 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
456 static bool icount_state_needed(void *opaque)
458 return use_icount;
462 * This is a subsection for icount migration.
464 static const VMStateDescription icount_vmstate_timers = {
465 .name = "timer/icount",
466 .version_id = 1,
467 .minimum_version_id = 1,
468 .fields = (VMStateField[]) {
469 VMSTATE_INT64(qemu_icount_bias, TimersState),
470 VMSTATE_INT64(qemu_icount, TimersState),
471 VMSTATE_END_OF_LIST()
475 static const VMStateDescription vmstate_timers = {
476 .name = "timer",
477 .version_id = 2,
478 .minimum_version_id = 1,
479 .fields = (VMStateField[]) {
480 VMSTATE_INT64(cpu_ticks_offset, TimersState),
481 VMSTATE_INT64(dummy, TimersState),
482 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
483 VMSTATE_END_OF_LIST()
485 .subsections = (VMStateSubsection[]) {
487 .vmsd = &icount_vmstate_timers,
488 .needed = icount_state_needed,
489 }, {
490 /* empty */
495 void configure_icount(QemuOpts *opts, Error **errp)
497 const char *option;
498 char *rem_str = NULL;
500 seqlock_init(&timers_state.vm_clock_seqlock, NULL);
501 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
502 option = qemu_opt_get(opts, "shift");
503 if (!option) {
504 if (qemu_opt_get(opts, "align") != NULL) {
505 error_setg(errp, "Please specify shift option when using align");
507 return;
509 icount_align_option = qemu_opt_get_bool(opts, "align", false);
510 icount_warp_timer = timer_new_ns(QEMU_CLOCK_REALTIME,
511 icount_warp_rt, NULL);
512 if (strcmp(option, "auto") != 0) {
513 errno = 0;
514 icount_time_shift = strtol(option, &rem_str, 0);
515 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
516 error_setg(errp, "icount: Invalid shift value");
518 use_icount = 1;
519 return;
520 } else if (icount_align_option) {
521 error_setg(errp, "shift=auto and align=on are incompatible");
524 use_icount = 2;
526 /* 125MIPS seems a reasonable initial guess at the guest speed.
527 It will be corrected fairly quickly anyway. */
528 icount_time_shift = 3;
530 /* Have both realtime and virtual time triggers for speed adjustment.
531 The realtime trigger catches emulated time passing too slowly,
532 the virtual time trigger catches emulated time passing too fast.
533 Realtime triggers occur even when idle, so use them less frequently
534 than VM triggers. */
535 icount_rt_timer = timer_new_ms(QEMU_CLOCK_REALTIME,
536 icount_adjust_rt, NULL);
537 timer_mod(icount_rt_timer,
538 qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000);
539 icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
540 icount_adjust_vm, NULL);
541 timer_mod(icount_vm_timer,
542 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
543 get_ticks_per_sec() / 10);
546 /***********************************************************/
547 void hw_error(const char *fmt, ...)
549 va_list ap;
550 CPUState *cpu;
552 va_start(ap, fmt);
553 fprintf(stderr, "qemu: hardware error: ");
554 vfprintf(stderr, fmt, ap);
555 fprintf(stderr, "\n");
556 CPU_FOREACH(cpu) {
557 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
558 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
560 va_end(ap);
561 abort();
564 void cpu_synchronize_all_states(void)
566 CPUState *cpu;
568 CPU_FOREACH(cpu) {
569 cpu_synchronize_state(cpu);
573 void cpu_synchronize_all_post_reset(void)
575 CPUState *cpu;
577 CPU_FOREACH(cpu) {
578 cpu_synchronize_post_reset(cpu);
582 void cpu_synchronize_all_post_init(void)
584 CPUState *cpu;
586 CPU_FOREACH(cpu) {
587 cpu_synchronize_post_init(cpu);
591 static int do_vm_stop(RunState state)
593 int ret = 0;
595 if (runstate_is_running()) {
596 cpu_disable_ticks();
597 pause_all_vcpus();
598 runstate_set(state);
599 vm_state_notify(0, state);
600 qapi_event_send_stop(&error_abort);
603 bdrv_drain_all();
604 ret = bdrv_flush_all();
606 return ret;
609 static bool cpu_can_run(CPUState *cpu)
611 if (cpu->stop) {
612 return false;
614 if (cpu_is_stopped(cpu)) {
615 return false;
617 return true;
620 static void cpu_handle_guest_debug(CPUState *cpu)
622 gdb_set_stop_cpu(cpu);
623 qemu_system_debug_request();
624 cpu->stopped = true;
627 static void cpu_signal(int sig)
629 if (current_cpu) {
630 cpu_exit(current_cpu);
632 exit_request = 1;
635 #ifdef CONFIG_LINUX
636 static void sigbus_reraise(void)
638 sigset_t set;
639 struct sigaction action;
641 memset(&action, 0, sizeof(action));
642 action.sa_handler = SIG_DFL;
643 if (!sigaction(SIGBUS, &action, NULL)) {
644 raise(SIGBUS);
645 sigemptyset(&set);
646 sigaddset(&set, SIGBUS);
647 sigprocmask(SIG_UNBLOCK, &set, NULL);
649 perror("Failed to re-raise SIGBUS!\n");
650 abort();
653 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
654 void *ctx)
656 if (kvm_on_sigbus(siginfo->ssi_code,
657 (void *)(intptr_t)siginfo->ssi_addr)) {
658 sigbus_reraise();
662 static void qemu_init_sigbus(void)
664 struct sigaction action;
666 memset(&action, 0, sizeof(action));
667 action.sa_flags = SA_SIGINFO;
668 action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
669 sigaction(SIGBUS, &action, NULL);
671 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
674 static void qemu_kvm_eat_signals(CPUState *cpu)
676 struct timespec ts = { 0, 0 };
677 siginfo_t siginfo;
678 sigset_t waitset;
679 sigset_t chkset;
680 int r;
682 sigemptyset(&waitset);
683 sigaddset(&waitset, SIG_IPI);
684 sigaddset(&waitset, SIGBUS);
686 do {
687 r = sigtimedwait(&waitset, &siginfo, &ts);
688 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
689 perror("sigtimedwait");
690 exit(1);
693 switch (r) {
694 case SIGBUS:
695 if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
696 sigbus_reraise();
698 break;
699 default:
700 break;
703 r = sigpending(&chkset);
704 if (r == -1) {
705 perror("sigpending");
706 exit(1);
708 } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
711 #else /* !CONFIG_LINUX */
/* Non-Linux hosts: no SIGBUS handling is set up. */
static void qemu_init_sigbus(void)
{
}
717 static void qemu_kvm_eat_signals(CPUState *cpu)
720 #endif /* !CONFIG_LINUX */
722 #ifndef _WIN32
/* Empty handler installed for SIG_IPI by the KVM signal setup. */
static void dummy_signal(int sig)
{
}
727 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
729 int r;
730 sigset_t set;
731 struct sigaction sigact;
733 memset(&sigact, 0, sizeof(sigact));
734 sigact.sa_handler = dummy_signal;
735 sigaction(SIG_IPI, &sigact, NULL);
737 pthread_sigmask(SIG_BLOCK, NULL, &set);
738 sigdelset(&set, SIG_IPI);
739 sigdelset(&set, SIGBUS);
740 r = kvm_set_signal_mask(cpu, &set);
741 if (r) {
742 fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
743 exit(1);
747 static void qemu_tcg_init_cpu_signals(void)
749 sigset_t set;
750 struct sigaction sigact;
752 memset(&sigact, 0, sizeof(sigact));
753 sigact.sa_handler = cpu_signal;
754 sigaction(SIG_IPI, &sigact, NULL);
756 sigemptyset(&set);
757 sigaddset(&set, SIG_IPI);
758 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
761 #else /* _WIN32 */
762 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
764 abort();
/* Win32 build: no signal setup is performed for the TCG thread. */
static void qemu_tcg_init_cpu_signals(void)
{
}
770 #endif /* _WIN32 */
772 static QemuMutex qemu_global_mutex;
773 static QemuCond qemu_io_proceeded_cond;
774 static bool iothread_requesting_mutex;
776 static QemuThread io_thread;
778 static QemuThread *tcg_cpu_thread;
779 static QemuCond *tcg_halt_cond;
781 /* cpu creation */
782 static QemuCond qemu_cpu_cond;
783 /* system init */
784 static QemuCond qemu_pause_cond;
785 static QemuCond qemu_work_cond;
787 void qemu_init_cpu_loop(void)
789 qemu_init_sigbus();
790 qemu_cond_init(&qemu_cpu_cond);
791 qemu_cond_init(&qemu_pause_cond);
792 qemu_cond_init(&qemu_work_cond);
793 qemu_cond_init(&qemu_io_proceeded_cond);
794 qemu_mutex_init(&qemu_global_mutex);
796 qemu_thread_get_self(&io_thread);
799 void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
801 struct qemu_work_item wi;
803 if (qemu_cpu_is_self(cpu)) {
804 func(data);
805 return;
808 wi.func = func;
809 wi.data = data;
810 wi.free = false;
811 if (cpu->queued_work_first == NULL) {
812 cpu->queued_work_first = &wi;
813 } else {
814 cpu->queued_work_last->next = &wi;
816 cpu->queued_work_last = &wi;
817 wi.next = NULL;
818 wi.done = false;
820 qemu_cpu_kick(cpu);
821 while (!wi.done) {
822 CPUState *self_cpu = current_cpu;
824 qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
825 current_cpu = self_cpu;
829 void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
831 struct qemu_work_item *wi;
833 if (qemu_cpu_is_self(cpu)) {
834 func(data);
835 return;
838 wi = g_malloc0(sizeof(struct qemu_work_item));
839 wi->func = func;
840 wi->data = data;
841 wi->free = true;
842 if (cpu->queued_work_first == NULL) {
843 cpu->queued_work_first = wi;
844 } else {
845 cpu->queued_work_last->next = wi;
847 cpu->queued_work_last = wi;
848 wi->next = NULL;
849 wi->done = false;
851 qemu_cpu_kick(cpu);
854 static void flush_queued_work(CPUState *cpu)
856 struct qemu_work_item *wi;
858 if (cpu->queued_work_first == NULL) {
859 return;
862 while ((wi = cpu->queued_work_first)) {
863 cpu->queued_work_first = wi->next;
864 wi->func(wi->data);
865 wi->done = true;
866 if (wi->free) {
867 g_free(wi);
870 cpu->queued_work_last = NULL;
871 qemu_cond_broadcast(&qemu_work_cond);
874 static void qemu_wait_io_event_common(CPUState *cpu)
876 if (cpu->stop) {
877 cpu->stop = false;
878 cpu->stopped = true;
879 qemu_cond_signal(&qemu_pause_cond);
881 flush_queued_work(cpu);
882 cpu->thread_kicked = false;
885 static void qemu_tcg_wait_io_event(void)
887 CPUState *cpu;
889 while (all_cpu_threads_idle()) {
890 /* Start accounting real time to the virtual clock if the CPUs
891 are idle. */
892 qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
893 qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
896 while (iothread_requesting_mutex) {
897 qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
900 CPU_FOREACH(cpu) {
901 qemu_wait_io_event_common(cpu);
905 static void qemu_kvm_wait_io_event(CPUState *cpu)
907 while (cpu_thread_is_idle(cpu)) {
908 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
911 qemu_kvm_eat_signals(cpu);
912 qemu_wait_io_event_common(cpu);
915 static void *qemu_kvm_cpu_thread_fn(void *arg)
917 CPUState *cpu = arg;
918 int r;
920 qemu_mutex_lock(&qemu_global_mutex);
921 qemu_thread_get_self(cpu->thread);
922 cpu->thread_id = qemu_get_thread_id();
923 current_cpu = cpu;
925 r = kvm_init_vcpu(cpu);
926 if (r < 0) {
927 fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
928 exit(1);
931 qemu_kvm_init_cpu_signals(cpu);
933 /* signal CPU creation */
934 cpu->created = true;
935 qemu_cond_signal(&qemu_cpu_cond);
937 while (1) {
938 if (cpu_can_run(cpu)) {
939 r = kvm_cpu_exec(cpu);
940 if (r == EXCP_DEBUG) {
941 cpu_handle_guest_debug(cpu);
944 qemu_kvm_wait_io_event(cpu);
947 return NULL;
950 static void *qemu_dummy_cpu_thread_fn(void *arg)
952 #ifdef _WIN32
953 fprintf(stderr, "qtest is not supported under Windows\n");
954 exit(1);
955 #else
956 CPUState *cpu = arg;
957 sigset_t waitset;
958 int r;
960 qemu_mutex_lock_iothread();
961 qemu_thread_get_self(cpu->thread);
962 cpu->thread_id = qemu_get_thread_id();
964 sigemptyset(&waitset);
965 sigaddset(&waitset, SIG_IPI);
967 /* signal CPU creation */
968 cpu->created = true;
969 qemu_cond_signal(&qemu_cpu_cond);
971 current_cpu = cpu;
972 while (1) {
973 current_cpu = NULL;
974 qemu_mutex_unlock_iothread();
975 do {
976 int sig;
977 r = sigwait(&waitset, &sig);
978 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
979 if (r == -1) {
980 perror("sigwait");
981 exit(1);
983 qemu_mutex_lock_iothread();
984 current_cpu = cpu;
985 qemu_wait_io_event_common(cpu);
988 return NULL;
989 #endif
992 static void tcg_exec_all(void);
994 static void *qemu_tcg_cpu_thread_fn(void *arg)
996 CPUState *cpu = arg;
998 qemu_tcg_init_cpu_signals();
999 qemu_thread_get_self(cpu->thread);
1001 qemu_mutex_lock(&qemu_global_mutex);
1002 CPU_FOREACH(cpu) {
1003 cpu->thread_id = qemu_get_thread_id();
1004 cpu->created = true;
1006 qemu_cond_signal(&qemu_cpu_cond);
1008 /* wait for initial kick-off after machine start */
1009 while (QTAILQ_FIRST(&cpus)->stopped) {
1010 qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
1012 /* process any pending work */
1013 CPU_FOREACH(cpu) {
1014 qemu_wait_io_event_common(cpu);
1018 while (1) {
1019 tcg_exec_all();
1021 if (use_icount) {
1022 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1024 if (deadline == 0) {
1025 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1028 qemu_tcg_wait_io_event();
1031 return NULL;
1034 static void qemu_cpu_kick_thread(CPUState *cpu)
1036 #ifndef _WIN32
1037 int err;
1039 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1040 if (err) {
1041 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1042 exit(1);
1044 #else /* _WIN32 */
1045 if (!qemu_cpu_is_self(cpu)) {
1046 CONTEXT tcgContext;
1048 if (SuspendThread(cpu->hThread) == (DWORD)-1) {
1049 fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
1050 GetLastError());
1051 exit(1);
1054 /* On multi-core systems, we are not sure that the thread is actually
1055 * suspended until we can get the context.
1057 tcgContext.ContextFlags = CONTEXT_CONTROL;
1058 while (GetThreadContext(cpu->hThread, &tcgContext) != 0) {
1059 continue;
1062 cpu_signal(0);
1064 if (ResumeThread(cpu->hThread) == (DWORD)-1) {
1065 fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
1066 GetLastError());
1067 exit(1);
1070 #endif
1073 void qemu_cpu_kick(CPUState *cpu)
1075 qemu_cond_broadcast(cpu->halt_cond);
1076 if (!tcg_enabled() && !cpu->thread_kicked) {
1077 qemu_cpu_kick_thread(cpu);
1078 cpu->thread_kicked = true;
1082 void qemu_cpu_kick_self(void)
1084 #ifndef _WIN32
1085 assert(current_cpu);
1087 if (!current_cpu->thread_kicked) {
1088 qemu_cpu_kick_thread(current_cpu);
1089 current_cpu->thread_kicked = true;
1091 #else
1092 abort();
1093 #endif
1096 bool qemu_cpu_is_self(CPUState *cpu)
1098 return qemu_thread_is_self(cpu->thread);
1101 static bool qemu_in_vcpu_thread(void)
1103 return current_cpu && qemu_cpu_is_self(current_cpu);
1106 void qemu_mutex_lock_iothread(void)
1108 if (!tcg_enabled()) {
1109 qemu_mutex_lock(&qemu_global_mutex);
1110 } else {
1111 iothread_requesting_mutex = true;
1112 if (qemu_mutex_trylock(&qemu_global_mutex)) {
1113 qemu_cpu_kick_thread(first_cpu);
1114 qemu_mutex_lock(&qemu_global_mutex);
1116 iothread_requesting_mutex = false;
1117 qemu_cond_broadcast(&qemu_io_proceeded_cond);
1121 void qemu_mutex_unlock_iothread(void)
1123 qemu_mutex_unlock(&qemu_global_mutex);
1126 static int all_vcpus_paused(void)
1128 CPUState *cpu;
1130 CPU_FOREACH(cpu) {
1131 if (!cpu->stopped) {
1132 return 0;
1136 return 1;
1139 void pause_all_vcpus(void)
1141 CPUState *cpu;
1143 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1144 CPU_FOREACH(cpu) {
1145 cpu->stop = true;
1146 qemu_cpu_kick(cpu);
1149 if (qemu_in_vcpu_thread()) {
1150 cpu_stop_current();
1151 if (!kvm_enabled()) {
1152 CPU_FOREACH(cpu) {
1153 cpu->stop = false;
1154 cpu->stopped = true;
1156 return;
1160 while (!all_vcpus_paused()) {
1161 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1162 CPU_FOREACH(cpu) {
1163 qemu_cpu_kick(cpu);
1168 void cpu_resume(CPUState *cpu)
1170 cpu->stop = false;
1171 cpu->stopped = false;
1172 qemu_cpu_kick(cpu);
1175 void resume_all_vcpus(void)
1177 CPUState *cpu;
1179 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1180 CPU_FOREACH(cpu) {
1181 cpu_resume(cpu);
/* Size of the temporary buffers used to build a vCPU thread name */
#define VCPU_THREAD_NAME_SIZE 16
1188 static void qemu_tcg_init_vcpu(CPUState *cpu)
1190 char thread_name[VCPU_THREAD_NAME_SIZE];
1192 tcg_cpu_address_space_init(cpu, cpu->as);
1194 /* share a single thread for all cpus with TCG */
1195 if (!tcg_cpu_thread) {
1196 cpu->thread = g_malloc0(sizeof(QemuThread));
1197 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1198 qemu_cond_init(cpu->halt_cond);
1199 tcg_halt_cond = cpu->halt_cond;
1200 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1201 cpu->cpu_index);
1202 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1203 cpu, QEMU_THREAD_JOINABLE);
1204 #ifdef _WIN32
1205 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1206 #endif
1207 while (!cpu->created) {
1208 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1210 tcg_cpu_thread = cpu->thread;
1211 } else {
1212 cpu->thread = tcg_cpu_thread;
1213 cpu->halt_cond = tcg_halt_cond;
1217 static void qemu_kvm_start_vcpu(CPUState *cpu)
1219 char thread_name[VCPU_THREAD_NAME_SIZE];
1221 cpu->thread = g_malloc0(sizeof(QemuThread));
1222 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1223 qemu_cond_init(cpu->halt_cond);
1224 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1225 cpu->cpu_index);
1226 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1227 cpu, QEMU_THREAD_JOINABLE);
1228 while (!cpu->created) {
1229 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1233 static void qemu_dummy_start_vcpu(CPUState *cpu)
1235 char thread_name[VCPU_THREAD_NAME_SIZE];
1237 cpu->thread = g_malloc0(sizeof(QemuThread));
1238 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1239 qemu_cond_init(cpu->halt_cond);
1240 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1241 cpu->cpu_index);
1242 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1243 QEMU_THREAD_JOINABLE);
1244 while (!cpu->created) {
1245 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1249 void qemu_init_vcpu(CPUState *cpu)
1251 cpu->nr_cores = smp_cores;
1252 cpu->nr_threads = smp_threads;
1253 cpu->stopped = true;
1254 if (kvm_enabled()) {
1255 qemu_kvm_start_vcpu(cpu);
1256 } else if (tcg_enabled()) {
1257 qemu_tcg_init_vcpu(cpu);
1258 } else {
1259 qemu_dummy_start_vcpu(cpu);
1263 void cpu_stop_current(void)
1265 if (current_cpu) {
1266 current_cpu->stop = false;
1267 current_cpu->stopped = true;
1268 cpu_exit(current_cpu);
1269 qemu_cond_signal(&qemu_pause_cond);
1273 int vm_stop(RunState state)
1275 if (qemu_in_vcpu_thread()) {
1276 qemu_system_vmstop_request_prepare();
1277 qemu_system_vmstop_request(state);
1279 * FIXME: should not return to device code in case
1280 * vm_stop() has been requested.
1282 cpu_stop_current();
1283 return 0;
1286 return do_vm_stop(state);
1289 /* does a state transition even if the VM is already stopped,
1290 current state is forgotten forever */
1291 int vm_stop_force_state(RunState state)
1293 if (runstate_is_running()) {
1294 return vm_stop(state);
1295 } else {
1296 runstate_set(state);
1297 /* Make sure to return an error if the flush in a previous vm_stop()
1298 * failed. */
1299 return bdrv_flush_all();
1303 static int tcg_cpu_exec(CPUArchState *env)
1305 CPUState *cpu = ENV_GET_CPU(env);
1306 int ret;
1307 #ifdef CONFIG_PROFILER
1308 int64_t ti;
1309 #endif
1311 #ifdef CONFIG_PROFILER
1312 ti = profile_getclock();
1313 #endif
1314 if (use_icount) {
1315 int64_t count;
1316 int64_t deadline;
1317 int decr;
1318 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1319 + cpu->icount_extra);
1320 cpu->icount_decr.u16.low = 0;
1321 cpu->icount_extra = 0;
1322 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1324 /* Maintain prior (possibly buggy) behaviour where if no deadline
1325 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1326 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1327 * nanoseconds.
1329 if ((deadline < 0) || (deadline > INT32_MAX)) {
1330 deadline = INT32_MAX;
1333 count = qemu_icount_round(deadline);
1334 timers_state.qemu_icount += count;
1335 decr = (count > 0xffff) ? 0xffff : count;
1336 count -= decr;
1337 cpu->icount_decr.u16.low = decr;
1338 cpu->icount_extra = count;
1340 ret = cpu_exec(env);
1341 #ifdef CONFIG_PROFILER
1342 qemu_time += profile_getclock() - ti;
1343 #endif
1344 if (use_icount) {
1345 /* Fold pending instructions back into the
1346 instruction counter, and clear the interrupt flag. */
1347 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1348 + cpu->icount_extra);
1349 cpu->icount_decr.u32 = 0;
1350 cpu->icount_extra = 0;
1352 return ret;
1355 static void tcg_exec_all(void)
1357 int r;
1359 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1360 qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
1362 if (next_cpu == NULL) {
1363 next_cpu = first_cpu;
1365 for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) {
1366 CPUState *cpu = next_cpu;
1367 CPUArchState *env = cpu->env_ptr;
1369 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1370 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1372 if (cpu_can_run(cpu)) {
1373 r = tcg_cpu_exec(env);
1374 if (r == EXCP_DEBUG) {
1375 cpu_handle_guest_debug(cpu);
1376 break;
1378 } else if (cpu->stop || cpu->stopped) {
1379 break;
1382 exit_request = 0;
1385 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1387 /* XXX: implement xxx_cpu_list for targets that still miss it */
1388 #if defined(cpu_list)
1389 cpu_list(f, cpu_fprintf);
1390 #endif
1393 CpuInfoList *qmp_query_cpus(Error **errp)
1395 CpuInfoList *head = NULL, *cur_item = NULL;
1396 CPUState *cpu;
1398 CPU_FOREACH(cpu) {
1399 CpuInfoList *info;
1400 #if defined(TARGET_I386)
1401 X86CPU *x86_cpu = X86_CPU(cpu);
1402 CPUX86State *env = &x86_cpu->env;
1403 #elif defined(TARGET_PPC)
1404 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1405 CPUPPCState *env = &ppc_cpu->env;
1406 #elif defined(TARGET_SPARC)
1407 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1408 CPUSPARCState *env = &sparc_cpu->env;
1409 #elif defined(TARGET_MIPS)
1410 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1411 CPUMIPSState *env = &mips_cpu->env;
1412 #endif
1414 cpu_synchronize_state(cpu);
1416 info = g_malloc0(sizeof(*info));
1417 info->value = g_malloc0(sizeof(*info->value));
1418 info->value->CPU = cpu->cpu_index;
1419 info->value->current = (cpu == first_cpu);
1420 info->value->halted = cpu->halted;
1421 info->value->thread_id = cpu->thread_id;
1422 #if defined(TARGET_I386)
1423 info->value->has_pc = true;
1424 info->value->pc = env->eip + env->segs[R_CS].base;
1425 #elif defined(TARGET_PPC)
1426 info->value->has_nip = true;
1427 info->value->nip = env->nip;
1428 #elif defined(TARGET_SPARC)
1429 info->value->has_pc = true;
1430 info->value->pc = env->pc;
1431 info->value->has_npc = true;
1432 info->value->npc = env->npc;
1433 #elif defined(TARGET_MIPS)
1434 info->value->has_PC = true;
1435 info->value->PC = env->active_tc.PC;
1436 #endif
1438 /* XXX: waiting for the qapi to support GSList */
1439 if (!cur_item) {
1440 head = cur_item = info;
1441 } else {
1442 cur_item->next = info;
1443 cur_item = info;
1447 return head;
1450 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1451 bool has_cpu, int64_t cpu_index, Error **errp)
1453 FILE *f;
1454 uint32_t l;
1455 CPUState *cpu;
1456 uint8_t buf[1024];
1458 if (!has_cpu) {
1459 cpu_index = 0;
1462 cpu = qemu_get_cpu(cpu_index);
1463 if (cpu == NULL) {
1464 error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1465 "a CPU number");
1466 return;
1469 f = fopen(filename, "wb");
1470 if (!f) {
1471 error_setg_file_open(errp, errno, filename);
1472 return;
1475 while (size != 0) {
1476 l = sizeof(buf);
1477 if (l > size)
1478 l = size;
1479 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1480 error_setg(errp, "Invalid addr 0x%016" PRIx64 "specified", addr);
1481 goto exit;
1483 if (fwrite(buf, 1, l, f) != l) {
1484 error_set(errp, QERR_IO_ERROR);
1485 goto exit;
1487 addr += l;
1488 size -= l;
1491 exit:
1492 fclose(f);
1495 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1496 Error **errp)
1498 FILE *f;
1499 uint32_t l;
1500 uint8_t buf[1024];
1502 f = fopen(filename, "wb");
1503 if (!f) {
1504 error_setg_file_open(errp, errno, filename);
1505 return;
1508 while (size != 0) {
1509 l = sizeof(buf);
1510 if (l > size)
1511 l = size;
1512 cpu_physical_memory_read(addr, buf, l);
1513 if (fwrite(buf, 1, l, f) != l) {
1514 error_set(errp, QERR_IO_ERROR);
1515 goto exit;
1517 addr += l;
1518 size -= l;
1521 exit:
1522 fclose(f);
1525 void qmp_inject_nmi(Error **errp)
1527 #if defined(TARGET_I386)
1528 CPUState *cs;
1530 CPU_FOREACH(cs) {
1531 X86CPU *cpu = X86_CPU(cs);
1533 if (!cpu->apic_state) {
1534 cpu_interrupt(cs, CPU_INTERRUPT_NMI);
1535 } else {
1536 apic_deliver_nmi(cpu->apic_state);
1539 #elif defined(TARGET_S390X)
1540 CPUState *cs;
1541 S390CPU *cpu;
1543 CPU_FOREACH(cs) {
1544 cpu = S390_CPU(cs);
1545 if (cpu->env.cpu_num == monitor_get_cpu_index()) {
1546 if (s390_cpu_restart(S390_CPU(cs)) == -1) {
1547 error_set(errp, QERR_UNSUPPORTED);
1548 return;
1550 break;
1553 #else
1554 error_set(errp, QERR_UNSUPPORTED);
1555 #endif
1558 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1560 if (!use_icount) {
1561 return;
1564 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
1565 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
1566 if (icount_align_option) {
1567 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
1568 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
1569 } else {
1570 cpu_fprintf(f, "Max guest delay NA\n");
1571 cpu_fprintf(f, "Max guest advance NA\n");