cpu-exec: Add sleeping algorithm
[qemu/kevin.git] / cpus.c
/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
/* Needed early for CONFIG_BSD etc. */
#include "config-host.h"

#include "monitor/monitor.h"
#include "qapi/qmp/qerror.h"
#include "sysemu/sysemu.h"
#include "exec/gdbstub.h"
#include "sysemu/dma.h"
#include "sysemu/kvm.h"
#include "qmp-commands.h"

#include "qemu/thread.h"
#include "sysemu/cpus.h"
#include "sysemu/qtest.h"
#include "qemu/main-loop.h"
#include "qemu/bitmap.h"
#include "qemu/seqlock.h"
#include "qapi-event.h"

#ifndef _WIN32
#include "qemu/compatfd.h"
#endif

#ifdef CONFIG_LINUX

#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */
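
/* The next CPU that the round-robin loop in tcg_exec_all() below will
 * run; NULL means "start over from the first CPU".
 */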
static CPUState *next_cpu;

bool cpu_is_stopped(CPUState *cpu)
{
    return cpu->stopped || !runstate_is_running();
}

static bool cpu_thread_is_idle(CPUState *cpu)
{
    if (cpu->stop || cpu->queued_work_first) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return true;
    }
    if (!cpu->halted || cpu_has_work(cpu) ||
        kvm_halt_in_kernel()) {
        return false;
    }
    return true;
}

static bool all_cpu_threads_idle(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu_thread_is_idle(cpu)) {
            return false;
        }
    }
    return true;
}
/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */
static int64_t vm_clock_warp_start = -1;
/* Conversion factor from emulated instructions to virtual clock ticks.  */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
#define MAX_ICOUNT_SHIFT 10
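/* As an illustration: a shift of 0 charges 1 ns of virtual time per guest
 * instruction (a nominal 1000 MIPS guest), while the maximum shift of 10
 * charges 1024 ns per instruction, i.e. roughly 1 MIPS.
 */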

static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;

typedef struct TimersState {
    /* Protected by BQL.  */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read out of BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
} TimersState;
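
/* The dummy field above appears only in vmstate_timers below; it seems to
 * be kept purely so that the migration stream layout stays compatible with
 * older QEMU versions.
 */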
static TimersState timers_state;

/* Return the virtual CPU time, based on the instruction counter.  */
static int64_t cpu_get_icount_locked(void)
{
    int64_t icount;
    CPUState *cpu = current_cpu;

    icount = timers_state.qemu_icount;
    if (cpu) {
        if (!cpu_can_do_io(cpu)) {
            fprintf(stderr, "Bad clock read\n");
        }
        icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
    }
    return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
}
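
/* Lock-free reader for the icount-based clock: retry the read whenever a
 * writer raced with us on vm_clock_seqlock (writers hold the BQL).
 */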
int64_t cpu_get_icount(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}

int64_t cpu_icount_to_ns(int64_t icount)
{
    return icount << icount_time_shift;
}

/* return the host CPU cycle counter and handle stop/restart */
/* Caller must hold the BQL */
int64_t cpu_get_ticks(void)
{
    int64_t ticks;

    if (use_icount) {
        return cpu_get_icount();
    }

    ticks = timers_state.cpu_ticks_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += cpu_get_real_ticks();
    }

    if (timers_state.cpu_ticks_prev > ticks) {
        /* Note: non-increasing ticks may happen if the host uses
           software suspend.  */
        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        ticks = timers_state.cpu_ticks_prev;
    }

    timers_state.cpu_ticks_prev = ticks;
    return ticks;
}
static int64_t cpu_get_clock_locked(void)
{
    int64_t ticks;

    ticks = timers_state.cpu_clock_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += get_clock();
    }

    return ticks;
}

/* return the host CPU monotonic timer and handle stop/restart */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = cpu_get_clock_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return ti;
}

/* return the offset between the host clock and virtual CPU clock */
int64_t cpu_get_clock_offset(void)
{
    int64_t ti;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = timers_state.cpu_clock_offset;
        if (!timers_state.cpu_ticks_enabled) {
            ti -= get_clock();
        }
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return -ti;
}
/* enable cpu_get_ticks()
 * Caller must hold the BQL, which serves as the mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
}

/* disable cpu_get_ticks() : the clock is stopped.  You must not call
 * cpu_get_ticks() after that.
 * Caller must hold the BQL, which serves as the mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset += cpu_get_real_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
}
/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.  */
#define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)
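/* With get_ticks_per_sec() at 10^9 (nanosecond timekeeping), ICOUNT_WOBBLE
 * comes to roughly 100ms; drift smaller than this is ignored by
 * icount_adjust() below.
 */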

static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex.  */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        icount_time_shift++;
    }
    last_delta = delta;
    timers_state.qemu_icount_bias = cur_icount
                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
}

static void icount_adjust_rt(void *opaque)
{
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000);
    icount_adjust();
}

static void icount_adjust_vm(void *opaque)
{
    timer_mod(icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              get_ticks_per_sec() / 10);
    icount_adjust();
}
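
/* Round a nanosecond deadline up to a whole number of instructions at the
 * current rate of 2^icount_time_shift nanoseconds per instruction.
 */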
static int64_t qemu_icount_round(int64_t count)
{
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
}

static void icount_warp_rt(void *opaque)
{
    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    if (atomic_read(&vm_clock_warp_start) == -1) {
        return;
    }

    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_time = cpu_get_clock_locked();
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = cur_time - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        timers_state.qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}
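
/* Step QEMU_CLOCK_VIRTUAL forward to @dest, running every virtual timer
 * that falls due on the way.  Only used under qtest, where the test case
 * advances the clock explicitly instead of relying on the warp machinery
 * above.
 */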
void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    assert(qtest_enabled());
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
        seqlock_write_lock(&timers_state.vm_clock_seqlock);
        timers_state.qemu_icount_bias += warp;
        seqlock_write_unlock(&timers_state.vm_clock_seqlock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}
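
/* When all vCPUs are idle, QEMU_CLOCK_VIRTUAL would otherwise stall; this
 * either fires due timers immediately or schedules a QEMU_CLOCK_REALTIME
 * warp so that virtual time catches up with real time.  See the comment
 * in the body for the rationale.
 */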
void qemu_clock_warp(QEMUClockType type)
{
    int64_t clock;
    int64_t deadline;

    /*
     * There are too many global variables to make the "warp" behavior
     * applicable to other clocks.  But a clock argument removes the
     * need for if statements all over the place.
     */
    if (type != QEMU_CLOCK_VIRTUAL || !use_icount) {
        return;
    }

    /*
     * If the CPUs have been sleeping, advance QEMU_CLOCK_VIRTUAL timer now.
     * This ensures that the deadline for the timer is computed correctly below.
     * This also makes sure that the insn counter is synchronized before the
     * CPU starts running, in case the CPU is woken by an event other than
     * the earliest QEMU_CLOCK_VIRTUAL timer.
     */
    icount_warp_rt(NULL);
    timer_del(icount_warp_timer);
    if (!all_cpu_threads_idle()) {
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount.  */
        return;
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         *
         * An extreme solution for this problem would be to never let VCPUs
         * sleep in icount mode if there is a pending QEMU_CLOCK_VIRTUAL
         * timer; rather time could just advance to the next QEMU_CLOCK_VIRTUAL
         * event.  Instead, we do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL
         * after some "real" time (related to the time left until the next
         * event) has passed.  The QEMU_CLOCK_REALTIME timer will do this.
         * This avoids that the warps are visible externally; for example,
         * you will not be sending network packets continuously instead of
         * every 100ms.
         */
        seqlock_write_lock(&timers_state.vm_clock_seqlock);
        if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
            vm_clock_warp_start = clock;
        }
        seqlock_write_unlock(&timers_state.vm_clock_seqlock);
        timer_mod_anticipate(icount_warp_timer, clock + deadline);
    } else if (deadline == 0) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}
static bool icount_state_needed(void *opaque)
{
    return use_icount;
}

/*
 * This is a subsection for icount migration.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (VMStateSubsection[]) {
        {
            .vmsd = &icount_vmstate_timers,
            .needed = icount_state_needed,
        }, {
            /* empty */
        }
    }
};
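
/* Called for the -icount command-line option.  As the code below implies,
 * "shift" is either a number (a fixed 2^shift ns charged per instruction)
 * or "auto" for the adaptive mode driven by icount_adjust() above, and
 * "align" is only accepted together with a fixed shift.
 */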
void configure_icount(QemuOpts *opts, Error **errp)
{
    const char *option;
    char *rem_str = NULL;

    seqlock_init(&timers_state.vm_clock_seqlock, NULL);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    option = qemu_opt_get(opts, "shift");
    if (!option) {
        if (qemu_opt_get(opts, "align") != NULL) {
            error_setg(errp, "Please specify shift option when using align");
        }
        return;
    }
    icount_align_option = qemu_opt_get_bool(opts, "align", false);
    icount_warp_timer = timer_new_ns(QEMU_CLOCK_REALTIME,
                                     icount_warp_rt, NULL);
    if (strcmp(option, "auto") != 0) {
        errno = 0;
        icount_time_shift = strtol(option, &rem_str, 0);
        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
            error_setg(errp, "icount: Invalid shift value");
        }
        use_icount = 1;
        return;
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway.  */
    icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers.  */
    icount_rt_timer = timer_new_ms(QEMU_CLOCK_REALTIME,
                                   icount_adjust_rt, NULL);
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000);
    icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                   icount_adjust_vm, NULL);
    timer_mod(icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              get_ticks_per_sec() / 10);
}

/***********************************************************/
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    CPU_FOREACH(cpu) {
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}

void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_state(cpu);
    }
}

void cpu_synchronize_all_post_reset(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_reset(cpu);
    }
}

void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_init(cpu);
    }
}

static int do_vm_stop(RunState state)
{
    int ret = 0;

    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        qapi_event_send_stop(&error_abort);
    }

    bdrv_drain_all();
    ret = bdrv_flush_all();

    return ret;
}

static bool cpu_can_run(CPUState *cpu)
{
    if (cpu->stop) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return false;
    }
    return true;
}

static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}

static void cpu_signal(int sig)
{
    if (current_cpu) {
        cpu_exit(current_cpu);
    }
    exit_request = 1;
}

#ifdef CONFIG_LINUX
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        sigprocmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}

static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code,
                      (void *)(intptr_t)siginfo->ssi_addr)) {
        sigbus_reraise();
    }
}

static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
                sigbus_reraise();
            }
            break;
        default:
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
}

#else /* !CONFIG_LINUX */

static void qemu_init_sigbus(void)
{
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
}
#endif /* !CONFIG_LINUX */

#ifndef _WIN32
static void dummy_signal(int sig)
{
}

static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);
    r = kvm_set_signal_mask(cpu, &set);
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

static void qemu_tcg_init_cpu_signals(void)
{
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = cpu_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    sigemptyset(&set);
    sigaddset(&set, SIG_IPI);
    pthread_sigmask(SIG_UNBLOCK, &set, NULL);
}

#else /* _WIN32 */
static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    abort();
}

static void qemu_tcg_init_cpu_signals(void)
{
}
#endif /* _WIN32 */

static QemuMutex qemu_global_mutex;
static QemuCond qemu_io_proceeded_cond;
static bool iothread_requesting_mutex;

static QemuThread io_thread;

static QemuThread *tcg_cpu_thread;
static QemuCond *tcg_halt_cond;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;
static QemuCond qemu_work_cond;

void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_cond_init(&qemu_work_cond);
    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}

void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
{
    struct qemu_work_item wi;

    if (qemu_cpu_is_self(cpu)) {
        func(data);
        return;
    }

    wi.func = func;
    wi.data = data;
    wi.free = false;
    if (cpu->queued_work_first == NULL) {
        cpu->queued_work_first = &wi;
    } else {
        cpu->queued_work_last->next = &wi;
    }
    cpu->queued_work_last = &wi;
    wi.next = NULL;
    wi.done = false;

    qemu_cpu_kick(cpu);
    while (!wi.done) {
        CPUState *self_cpu = current_cpu;

        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
        current_cpu = self_cpu;
    }
}
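
/* Like run_on_cpu() above, but returns without waiting: the work item is
 * heap-allocated and freed by flush_queued_work() on the target CPU's
 * thread, whereas run_on_cpu() can keep it on the caller's stack because
 * it blocks until the item is done.
 */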
void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
{
    struct qemu_work_item *wi;

    if (qemu_cpu_is_self(cpu)) {
        func(data);
        return;
    }

    wi = g_malloc0(sizeof(struct qemu_work_item));
    wi->func = func;
    wi->data = data;
    wi->free = true;
    if (cpu->queued_work_first == NULL) {
        cpu->queued_work_first = wi;
    } else {
        cpu->queued_work_last->next = wi;
    }
    cpu->queued_work_last = wi;
    wi->next = NULL;
    wi->done = false;

    qemu_cpu_kick(cpu);
}

static void flush_queued_work(CPUState *cpu)
{
    struct qemu_work_item *wi;

    if (cpu->queued_work_first == NULL) {
        return;
    }

    while ((wi = cpu->queued_work_first)) {
        cpu->queued_work_first = wi->next;
        wi->func(wi->data);
        wi->done = true;
        if (wi->free) {
            g_free(wi);
        }
    }
    cpu->queued_work_last = NULL;
    qemu_cond_broadcast(&qemu_work_cond);
}

static void qemu_wait_io_event_common(CPUState *cpu)
{
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        qemu_cond_signal(&qemu_pause_cond);
    }
    flush_queued_work(cpu);
    cpu->thread_kicked = false;
}

static void qemu_tcg_wait_io_event(void)
{
    CPUState *cpu;

    while (all_cpu_threads_idle()) {
        /* Start accounting real time to the virtual clock if the CPUs
           are idle.  */
        qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
        qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
    }

    while (iothread_requesting_mutex) {
        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
    }

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}

static void qemu_kvm_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    qemu_kvm_eat_signals(cpu);
    qemu_wait_io_event_common(cpu);
}

static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    qemu_mutex_lock(&qemu_global_mutex);
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    qemu_kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    }

    return NULL;
}

static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    fprintf(stderr, "qtest is not supported under Windows\n");
    exit(1);
#else
    CPUState *cpu = arg;
    sigset_t waitset;
    int r;

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    current_cpu = cpu;
    while (1) {
        current_cpu = NULL;
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        current_cpu = cpu;
        qemu_wait_io_event_common(cpu);
    }

    return NULL;
#endif
}

static void tcg_exec_all(void);
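
/* With TCG, all vCPUs are multiplexed round-robin onto this one thread
 * via tcg_exec_all(); contrast qemu_kvm_start_vcpu(), which creates a
 * thread per vCPU.
 */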
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    qemu_tcg_init_cpu_signals();
    qemu_thread_get_self(cpu->thread);

    qemu_mutex_lock(&qemu_global_mutex);
    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (QTAILQ_FIRST(&cpus)->stopped) {
        qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            qemu_wait_io_event_common(cpu);
        }
    }

    while (1) {
        tcg_exec_all();

        if (use_icount) {
            int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

            if (deadline == 0) {
                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
            }
        }
        qemu_tcg_wait_io_event();
    }

    return NULL;
}

static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    if (!qemu_cpu_is_self(cpu)) {
        CONTEXT tcgContext;

        if (SuspendThread(cpu->hThread) == (DWORD)-1) {
            fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
                    GetLastError());
            exit(1);
        }

        /* On multi-core systems, we are not sure that the thread is actually
         * suspended until we can get the context.
         */
        tcgContext.ContextFlags = CONTEXT_CONTROL;
        while (GetThreadContext(cpu->hThread, &tcgContext) != 0) {
            continue;
        }

        cpu_signal(0);

        if (ResumeThread(cpu->hThread) == (DWORD)-1) {
            fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
                    GetLastError());
            exit(1);
        }
    }
#endif
}

void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (!tcg_enabled() && !cpu->thread_kicked) {
        qemu_cpu_kick_thread(cpu);
        cpu->thread_kicked = true;
    }
}

void qemu_cpu_kick_self(void)
{
#ifndef _WIN32
    assert(current_cpu);

    if (!current_cpu->thread_kicked) {
        qemu_cpu_kick_thread(current_cpu);
        current_cpu->thread_kicked = true;
    }
#else
    abort();
#endif
}

bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}

static bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}

void qemu_mutex_lock_iothread(void)
{
    if (!tcg_enabled()) {
        qemu_mutex_lock(&qemu_global_mutex);
    } else {
        iothread_requesting_mutex = true;
        if (qemu_mutex_trylock(&qemu_global_mutex)) {
            qemu_cpu_kick_thread(first_cpu);
            qemu_mutex_lock(&qemu_global_mutex);
        }
        iothread_requesting_mutex = false;
        qemu_cond_broadcast(&qemu_io_proceeded_cond);
    }
}

void qemu_mutex_unlock_iothread(void)
{
    qemu_mutex_unlock(&qemu_global_mutex);
}

static int all_vcpus_paused(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu->stopped) {
            return 0;
        }
    }

    return 1;
}

void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        cpu->stop = true;
        qemu_cpu_kick(cpu);
    }

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
        if (!kvm_enabled()) {
            CPU_FOREACH(cpu) {
                cpu->stop = false;
                cpu->stopped = true;
            }
            return;
        }
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }
}

void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}

void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}

/* For temporary buffers for forming a name */
#define VCPU_THREAD_NAME_SIZE 16

static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    tcg_cpu_address_space_init(cpu, cpu->as);

    /* share a single thread for all cpus with TCG */
    if (!tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);
        tcg_halt_cond = cpu->halt_cond;
        snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);
        qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                           cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
        tcg_cpu_thread = cpu->thread;
    } else {
        cpu->thread = tcg_cpu_thread;
        cpu->halt_cond = tcg_halt_cond;
    }
}

static void qemu_kvm_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

static void qemu_dummy_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
                       QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

void qemu_init_vcpu(CPUState *cpu)
{
    cpu->nr_cores = smp_cores;
    cpu->nr_threads = smp_threads;
    cpu->stopped = true;
    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }
}

void cpu_stop_current(void)
{
    if (current_cpu) {
        current_cpu->stop = false;
        current_cpu->stopped = true;
        cpu_exit(current_cpu);
        qemu_cond_signal(&qemu_pause_cond);
    }
}

int vm_stop(RunState state)
{
    if (qemu_in_vcpu_thread()) {
        qemu_system_vmstop_request_prepare();
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return 0;
    }

    return do_vm_stop(state);
}

/* does a state transition even if the VM is already stopped,
   current state is forgotten forever */
int vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        return vm_stop(state);
    } else {
        runstate_set(state);
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
        return bdrv_flush_all();
    }
}
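
/* Run one execution slice of a TCG vCPU.  Under icount, the slice's
 * instruction budget is the time left until the next QEMU_CLOCK_VIRTUAL
 * deadline: up to 0xffff instructions go into icount_decr.u16.low and any
 * remainder is held back in icount_extra.
 */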
static int tcg_cpu_exec(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    if (use_icount) {
        int64_t count;
        int64_t deadline;
        int decr;
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                    + cpu->icount_extra);
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        /* Maintain prior (possibly buggy) behaviour where if no deadline
         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
         * nanoseconds.
         */
        if ((deadline < 0) || (deadline > INT32_MAX)) {
            deadline = INT32_MAX;
        }

        count = qemu_icount_round(deadline);
        timers_state.qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        cpu->icount_decr.u16.low = decr;
        cpu->icount_extra = count;
    }
    ret = cpu_exec(env);
#ifdef CONFIG_PROFILER
    qemu_time += profile_getclock() - ti;
#endif
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag.  */
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                    + cpu->icount_extra);
        cpu->icount_decr.u32 = 0;
        cpu->icount_extra = 0;
    }
    return ret;
}
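
/* Round-robin over all vCPUs, resuming from where the previous call left
 * off (next_cpu), until every CPU has had a turn or an exit is requested.
 */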
static void tcg_exec_all(void)
{
    int r;

    /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
    qemu_clock_warp(QEMU_CLOCK_VIRTUAL);

    if (next_cpu == NULL) {
        next_cpu = first_cpu;
    }
    for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) {
        CPUState *cpu = next_cpu;
        CPUArchState *env = cpu->env_ptr;

        qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                          (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

        if (cpu_can_run(cpu)) {
            r = tcg_cpu_exec(env);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
                break;
            }
        } else if (cpu->stop || cpu->stopped) {
            break;
        }
    }
    exit_request = 0;
}

void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}

CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
#endif

        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->has_pc = true;
        info->value->pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->has_nip = true;
        info->value->nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->has_pc = true;
        info->value->pc = env->pc;
        info->value->has_npc = true;
        info->value->npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->has_PC = true;
        info->value->PC = env->active_tc.PC;
#endif

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

void qmp_memsave(int64_t addr, int64_t size, const char *filename,
                 bool has_cpu, int64_t cpu_index, Error **errp)
{
    FILE *f;
    uint32_t l;
    CPUState *cpu;
    uint8_t buf[1024];

    if (!has_cpu) {
        cpu_index = 0;
    }

    cpu = qemu_get_cpu(cpu_index);
    if (cpu == NULL) {
        error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
                  "a CPU number");
        return;
    }

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
            error_setg(errp, "Invalid addr 0x%016" PRIx64 " specified", addr);
            goto exit;
        }
        if (fwrite(buf, 1, l, f) != l) {
            error_set(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
                  Error **errp)
{
    FILE *f;
    uint32_t l;
    uint8_t buf[1024];

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        cpu_physical_memory_read(addr, buf, l);
        if (fwrite(buf, 1, l, f) != l) {
            error_set(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_inject_nmi(Error **errp)
{
#if defined(TARGET_I386)
    CPUState *cs;

    CPU_FOREACH(cs) {
        X86CPU *cpu = X86_CPU(cs);

        if (!cpu->apic_state) {
            cpu_interrupt(cs, CPU_INTERRUPT_NMI);
        } else {
            apic_deliver_nmi(cpu->apic_state);
        }
    }
#elif defined(TARGET_S390X)
    CPUState *cs;
    S390CPU *cpu;

    CPU_FOREACH(cs) {
        cpu = S390_CPU(cs);
        if (cpu->env.cpu_num == monitor_get_cpu_index()) {
            if (s390_cpu_restart(S390_CPU(cs)) == -1) {
                error_set(errp, QERR_UNSUPPORTED);
                return;
            }
            break;
        }
    }
#else
    error_set(errp, QERR_UNSUPPORTED);
#endif
}