KVM: Fix GSI number space limit
[qemu/rayw.git] / cpus.c
blobdd7ac136215170bc17d3a6aef9a3d32b7c9dff71
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
25 /* Needed early for CONFIG_BSD etc. */
26 #include "config-host.h"
28 #include "monitor/monitor.h"
29 #include "sysemu/sysemu.h"
30 #include "exec/gdbstub.h"
31 #include "sysemu/dma.h"
32 #include "sysemu/kvm.h"
33 #include "qmp-commands.h"
35 #include "qemu/thread.h"
36 #include "sysemu/cpus.h"
37 #include "sysemu/qtest.h"
38 #include "qemu/main-loop.h"
39 #include "qemu/bitmap.h"
40 #include "qemu/seqlock.h"
42 #ifndef _WIN32
43 #include "qemu/compatfd.h"
44 #endif
46 #ifdef CONFIG_LINUX
48 #include <sys/prctl.h>
50 #ifndef PR_MCE_KILL
51 #define PR_MCE_KILL 33
52 #endif
54 #ifndef PR_MCE_KILL_SET
55 #define PR_MCE_KILL_SET 1
56 #endif
58 #ifndef PR_MCE_KILL_EARLY
59 #define PR_MCE_KILL_EARLY 1
60 #endif
62 #endif /* CONFIG_LINUX */
64 static CPUState *next_cpu;
66 bool cpu_is_stopped(CPUState *cpu)
68 return cpu->stopped || !runstate_is_running();
71 static bool cpu_thread_is_idle(CPUState *cpu)
73 if (cpu->stop || cpu->queued_work_first) {
74 return false;
76 if (cpu_is_stopped(cpu)) {
77 return true;
79 if (!cpu->halted || cpu_has_work(cpu) ||
80 kvm_halt_in_kernel()) {
81 return false;
83 return true;
86 static bool all_cpu_threads_idle(void)
88 CPUState *cpu;
90 CPU_FOREACH(cpu) {
91 if (!cpu_thread_is_idle(cpu)) {
92 return false;
95 return true;
98 /***********************************************************/
99 /* guest cycle counter */
101 /* Protected by TimersState seqlock */
103 /* Compensate for varying guest execution speed. */
104 static int64_t qemu_icount_bias;
105 static int64_t vm_clock_warp_start;
106 /* Conversion factor from emulated instructions to virtual clock ticks. */
107 static int icount_time_shift;
108 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
109 #define MAX_ICOUNT_SHIFT 10
111 /* Only written by TCG thread */
112 static int64_t qemu_icount;
114 static QEMUTimer *icount_rt_timer;
115 static QEMUTimer *icount_vm_timer;
116 static QEMUTimer *icount_warp_timer;
118 typedef struct TimersState {
119 /* Protected by BQL. */
120 int64_t cpu_ticks_prev;
121 int64_t cpu_ticks_offset;
123 /* cpu_clock_offset can be read out of BQL, so protect it with
124 * this lock.
126 QemuSeqLock vm_clock_seqlock;
127 int64_t cpu_clock_offset;
128 int32_t cpu_ticks_enabled;
129 int64_t dummy;
130 } TimersState;
132 static TimersState timers_state;
134 /* Return the virtual CPU time, based on the instruction counter. */
135 static int64_t cpu_get_icount_locked(void)
137 int64_t icount;
138 CPUState *cpu = current_cpu;
140 icount = qemu_icount;
141 if (cpu) {
142 if (!cpu_can_do_io(cpu)) {
143 fprintf(stderr, "Bad clock read\n");
145 icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
147 return qemu_icount_bias + (icount << icount_time_shift);
150 int64_t cpu_get_icount(void)
152 int64_t icount;
153 unsigned start;
155 do {
156 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
157 icount = cpu_get_icount_locked();
158 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
160 return icount;
163 /* return the host CPU cycle counter and handle stop/restart */
164 /* Caller must hold the BQL */
165 int64_t cpu_get_ticks(void)
167 int64_t ticks;
169 if (use_icount) {
170 return cpu_get_icount();
173 ticks = timers_state.cpu_ticks_offset;
174 if (timers_state.cpu_ticks_enabled) {
175 ticks += cpu_get_real_ticks();
178 if (timers_state.cpu_ticks_prev > ticks) {
179 /* Note: non increasing ticks may happen if the host uses
180 software suspend */
181 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
182 ticks = timers_state.cpu_ticks_prev;
185 timers_state.cpu_ticks_prev = ticks;
186 return ticks;
189 static int64_t cpu_get_clock_locked(void)
191 int64_t ticks;
193 ticks = timers_state.cpu_clock_offset;
194 if (timers_state.cpu_ticks_enabled) {
195 ticks += get_clock();
198 return ticks;
201 /* return the host CPU monotonic timer and handle stop/restart */
202 int64_t cpu_get_clock(void)
204 int64_t ti;
205 unsigned start;
207 do {
208 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
209 ti = cpu_get_clock_locked();
210 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
212 return ti;
215 /* enable cpu_get_ticks()
216 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
218 void cpu_enable_ticks(void)
220 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
221 seqlock_write_lock(&timers_state.vm_clock_seqlock);
222 if (!timers_state.cpu_ticks_enabled) {
223 timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
224 timers_state.cpu_clock_offset -= get_clock();
225 timers_state.cpu_ticks_enabled = 1;
227 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
230 /* disable cpu_get_ticks() : the clock is stopped. You must not call
231 * cpu_get_ticks() after that.
232 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
234 void cpu_disable_ticks(void)
236 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
237 seqlock_write_lock(&timers_state.vm_clock_seqlock);
238 if (timers_state.cpu_ticks_enabled) {
239 timers_state.cpu_ticks_offset += cpu_get_real_ticks();
240 timers_state.cpu_clock_offset = cpu_get_clock_locked();
241 timers_state.cpu_ticks_enabled = 0;
243 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
/* Tolerance used by icount_adjust(): one tenth of get_ticks_per_sec().
   Correlation between real and virtual time is always going to be
   fairly approximate, so small variations are ignored.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.  */
#define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)
252 static void icount_adjust(void)
254 int64_t cur_time;
255 int64_t cur_icount;
256 int64_t delta;
258 /* Protected by TimersState mutex. */
259 static int64_t last_delta;
261 /* If the VM is not running, then do nothing. */
262 if (!runstate_is_running()) {
263 return;
266 seqlock_write_lock(&timers_state.vm_clock_seqlock);
267 cur_time = cpu_get_clock_locked();
268 cur_icount = cpu_get_icount_locked();
270 delta = cur_icount - cur_time;
271 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
272 if (delta > 0
273 && last_delta + ICOUNT_WOBBLE < delta * 2
274 && icount_time_shift > 0) {
275 /* The guest is getting too far ahead. Slow time down. */
276 icount_time_shift--;
278 if (delta < 0
279 && last_delta - ICOUNT_WOBBLE > delta * 2
280 && icount_time_shift < MAX_ICOUNT_SHIFT) {
281 /* The guest is getting too far behind. Speed time up. */
282 icount_time_shift++;
284 last_delta = delta;
285 qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift);
286 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
289 static void icount_adjust_rt(void *opaque)
291 timer_mod(icount_rt_timer,
292 qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000);
293 icount_adjust();
296 static void icount_adjust_vm(void *opaque)
298 timer_mod(icount_vm_timer,
299 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
300 get_ticks_per_sec() / 10);
301 icount_adjust();
304 static int64_t qemu_icount_round(int64_t count)
306 return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
309 static void icount_warp_rt(void *opaque)
311 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
312 * changes from -1 to another value, so the race here is okay.
314 if (atomic_read(&vm_clock_warp_start) == -1) {
315 return;
318 seqlock_write_lock(&timers_state.vm_clock_seqlock);
319 if (runstate_is_running()) {
320 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
321 int64_t warp_delta;
323 warp_delta = clock - vm_clock_warp_start;
324 if (use_icount == 2) {
326 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
327 * far ahead of real time.
329 int64_t cur_time = cpu_get_clock_locked();
330 int64_t cur_icount = cpu_get_icount_locked();
331 int64_t delta = cur_time - cur_icount;
332 warp_delta = MIN(warp_delta, delta);
334 qemu_icount_bias += warp_delta;
336 vm_clock_warp_start = -1;
337 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
339 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
340 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
344 void qtest_clock_warp(int64_t dest)
346 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
347 assert(qtest_enabled());
348 while (clock < dest) {
349 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
350 int64_t warp = MIN(dest - clock, deadline);
351 seqlock_write_lock(&timers_state.vm_clock_seqlock);
352 qemu_icount_bias += warp;
353 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
355 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
356 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
358 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
361 void qemu_clock_warp(QEMUClockType type)
363 int64_t clock;
364 int64_t deadline;
367 * There are too many global variables to make the "warp" behavior
368 * applicable to other clocks. But a clock argument removes the
369 * need for if statements all over the place.
371 if (type != QEMU_CLOCK_VIRTUAL || !use_icount) {
372 return;
376 * If the CPUs have been sleeping, advance QEMU_CLOCK_VIRTUAL timer now.
377 * This ensures that the deadline for the timer is computed correctly below.
378 * This also makes sure that the insn counter is synchronized before the
379 * CPU starts running, in case the CPU is woken by an event other than
380 * the earliest QEMU_CLOCK_VIRTUAL timer.
382 icount_warp_rt(NULL);
383 timer_del(icount_warp_timer);
384 if (!all_cpu_threads_idle()) {
385 return;
388 if (qtest_enabled()) {
389 /* When testing, qtest commands advance icount. */
390 return;
393 /* We want to use the earliest deadline from ALL vm_clocks */
394 clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
395 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
396 if (deadline < 0) {
397 return;
400 if (deadline > 0) {
402 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
403 * sleep. Otherwise, the CPU might be waiting for a future timer
404 * interrupt to wake it up, but the interrupt never comes because
405 * the vCPU isn't running any insns and thus doesn't advance the
406 * QEMU_CLOCK_VIRTUAL.
408 * An extreme solution for this problem would be to never let VCPUs
409 * sleep in icount mode if there is a pending QEMU_CLOCK_VIRTUAL
410 * timer; rather time could just advance to the next QEMU_CLOCK_VIRTUAL
411 * event. Instead, we do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL
412 * after some e"real" time, (related to the time left until the next
413 * event) has passed. The QEMU_CLOCK_REALTIME timer will do this.
414 * This avoids that the warps are visible externally; for example,
415 * you will not be sending network packets continuously instead of
416 * every 100ms.
418 seqlock_write_lock(&timers_state.vm_clock_seqlock);
419 if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
420 vm_clock_warp_start = clock;
422 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
423 timer_mod_anticipate(icount_warp_timer, clock + deadline);
424 } else if (deadline == 0) {
425 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
429 static const VMStateDescription vmstate_timers = {
430 .name = "timer",
431 .version_id = 2,
432 .minimum_version_id = 1,
433 .fields = (VMStateField[]) {
434 VMSTATE_INT64(cpu_ticks_offset, TimersState),
435 VMSTATE_INT64(dummy, TimersState),
436 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
437 VMSTATE_END_OF_LIST()
441 void configure_icount(const char *option)
443 seqlock_init(&timers_state.vm_clock_seqlock, NULL);
444 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
445 if (!option) {
446 return;
449 icount_warp_timer = timer_new_ns(QEMU_CLOCK_REALTIME,
450 icount_warp_rt, NULL);
451 if (strcmp(option, "auto") != 0) {
452 icount_time_shift = strtol(option, NULL, 0);
453 use_icount = 1;
454 return;
457 use_icount = 2;
459 /* 125MIPS seems a reasonable initial guess at the guest speed.
460 It will be corrected fairly quickly anyway. */
461 icount_time_shift = 3;
463 /* Have both realtime and virtual time triggers for speed adjustment.
464 The realtime trigger catches emulated time passing too slowly,
465 the virtual time trigger catches emulated time passing too fast.
466 Realtime triggers occur even when idle, so use them less frequently
467 than VM triggers. */
468 icount_rt_timer = timer_new_ms(QEMU_CLOCK_REALTIME,
469 icount_adjust_rt, NULL);
470 timer_mod(icount_rt_timer,
471 qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000);
472 icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
473 icount_adjust_vm, NULL);
474 timer_mod(icount_vm_timer,
475 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
476 get_ticks_per_sec() / 10);
479 /***********************************************************/
480 void hw_error(const char *fmt, ...)
482 va_list ap;
483 CPUState *cpu;
485 va_start(ap, fmt);
486 fprintf(stderr, "qemu: hardware error: ");
487 vfprintf(stderr, fmt, ap);
488 fprintf(stderr, "\n");
489 CPU_FOREACH(cpu) {
490 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
491 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
493 va_end(ap);
494 abort();
497 void cpu_synchronize_all_states(void)
499 CPUState *cpu;
501 CPU_FOREACH(cpu) {
502 cpu_synchronize_state(cpu);
506 void cpu_synchronize_all_post_reset(void)
508 CPUState *cpu;
510 CPU_FOREACH(cpu) {
511 cpu_synchronize_post_reset(cpu);
515 void cpu_synchronize_all_post_init(void)
517 CPUState *cpu;
519 CPU_FOREACH(cpu) {
520 cpu_synchronize_post_init(cpu);
524 static int do_vm_stop(RunState state)
526 int ret = 0;
528 if (runstate_is_running()) {
529 cpu_disable_ticks();
530 pause_all_vcpus();
531 runstate_set(state);
532 vm_state_notify(0, state);
533 monitor_protocol_event(QEVENT_STOP, NULL);
536 bdrv_drain_all();
537 ret = bdrv_flush_all();
539 return ret;
542 static bool cpu_can_run(CPUState *cpu)
544 if (cpu->stop) {
545 return false;
547 if (cpu_is_stopped(cpu)) {
548 return false;
550 return true;
553 static void cpu_handle_guest_debug(CPUState *cpu)
555 gdb_set_stop_cpu(cpu);
556 qemu_system_debug_request();
557 cpu->stopped = true;
560 static void cpu_signal(int sig)
562 if (current_cpu) {
563 cpu_exit(current_cpu);
565 exit_request = 1;
568 #ifdef CONFIG_LINUX
569 static void sigbus_reraise(void)
571 sigset_t set;
572 struct sigaction action;
574 memset(&action, 0, sizeof(action));
575 action.sa_handler = SIG_DFL;
576 if (!sigaction(SIGBUS, &action, NULL)) {
577 raise(SIGBUS);
578 sigemptyset(&set);
579 sigaddset(&set, SIGBUS);
580 sigprocmask(SIG_UNBLOCK, &set, NULL);
582 perror("Failed to re-raise SIGBUS!\n");
583 abort();
586 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
587 void *ctx)
589 if (kvm_on_sigbus(siginfo->ssi_code,
590 (void *)(intptr_t)siginfo->ssi_addr)) {
591 sigbus_reraise();
595 static void qemu_init_sigbus(void)
597 struct sigaction action;
599 memset(&action, 0, sizeof(action));
600 action.sa_flags = SA_SIGINFO;
601 action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
602 sigaction(SIGBUS, &action, NULL);
604 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
607 static void qemu_kvm_eat_signals(CPUState *cpu)
609 struct timespec ts = { 0, 0 };
610 siginfo_t siginfo;
611 sigset_t waitset;
612 sigset_t chkset;
613 int r;
615 sigemptyset(&waitset);
616 sigaddset(&waitset, SIG_IPI);
617 sigaddset(&waitset, SIGBUS);
619 do {
620 r = sigtimedwait(&waitset, &siginfo, &ts);
621 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
622 perror("sigtimedwait");
623 exit(1);
626 switch (r) {
627 case SIGBUS:
628 if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
629 sigbus_reraise();
631 break;
632 default:
633 break;
636 r = sigpending(&chkset);
637 if (r == -1) {
638 perror("sigpending");
639 exit(1);
641 } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
644 #else /* !CONFIG_LINUX */
/* Non-Linux build: no SIGBUS setup is performed. */
static void qemu_init_sigbus(void)
{
}
650 static void qemu_kvm_eat_signals(CPUState *cpu)
653 #endif /* !CONFIG_LINUX */
655 #ifndef _WIN32
/* Intentionally empty handler installed for SIG_IPI by
 * qemu_kvm_init_cpu_signals(); @sig is unused. */
static void dummy_signal(int sig)
{
}
660 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
662 int r;
663 sigset_t set;
664 struct sigaction sigact;
666 memset(&sigact, 0, sizeof(sigact));
667 sigact.sa_handler = dummy_signal;
668 sigaction(SIG_IPI, &sigact, NULL);
670 pthread_sigmask(SIG_BLOCK, NULL, &set);
671 sigdelset(&set, SIG_IPI);
672 sigdelset(&set, SIGBUS);
673 r = kvm_set_signal_mask(cpu, &set);
674 if (r) {
675 fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
676 exit(1);
680 static void qemu_tcg_init_cpu_signals(void)
682 sigset_t set;
683 struct sigaction sigact;
685 memset(&sigact, 0, sizeof(sigact));
686 sigact.sa_handler = cpu_signal;
687 sigaction(SIG_IPI, &sigact, NULL);
689 sigemptyset(&set);
690 sigaddset(&set, SIG_IPI);
691 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
694 #else /* _WIN32 */
695 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
697 abort();
/* Windows build: no signal setup is needed for the TCG thread. */
static void qemu_tcg_init_cpu_signals(void)
{
}
703 #endif /* _WIN32 */
705 static QemuMutex qemu_global_mutex;
706 static QemuCond qemu_io_proceeded_cond;
707 static bool iothread_requesting_mutex;
709 static QemuThread io_thread;
711 static QemuThread *tcg_cpu_thread;
712 static QemuCond *tcg_halt_cond;
714 /* cpu creation */
715 static QemuCond qemu_cpu_cond;
716 /* system init */
717 static QemuCond qemu_pause_cond;
718 static QemuCond qemu_work_cond;
720 void qemu_init_cpu_loop(void)
722 qemu_init_sigbus();
723 qemu_cond_init(&qemu_cpu_cond);
724 qemu_cond_init(&qemu_pause_cond);
725 qemu_cond_init(&qemu_work_cond);
726 qemu_cond_init(&qemu_io_proceeded_cond);
727 qemu_mutex_init(&qemu_global_mutex);
729 qemu_thread_get_self(&io_thread);
732 void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
734 struct qemu_work_item wi;
736 if (qemu_cpu_is_self(cpu)) {
737 func(data);
738 return;
741 wi.func = func;
742 wi.data = data;
743 wi.free = false;
744 if (cpu->queued_work_first == NULL) {
745 cpu->queued_work_first = &wi;
746 } else {
747 cpu->queued_work_last->next = &wi;
749 cpu->queued_work_last = &wi;
750 wi.next = NULL;
751 wi.done = false;
753 qemu_cpu_kick(cpu);
754 while (!wi.done) {
755 CPUState *self_cpu = current_cpu;
757 qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
758 current_cpu = self_cpu;
762 void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
764 struct qemu_work_item *wi;
766 if (qemu_cpu_is_self(cpu)) {
767 func(data);
768 return;
771 wi = g_malloc0(sizeof(struct qemu_work_item));
772 wi->func = func;
773 wi->data = data;
774 wi->free = true;
775 if (cpu->queued_work_first == NULL) {
776 cpu->queued_work_first = wi;
777 } else {
778 cpu->queued_work_last->next = wi;
780 cpu->queued_work_last = wi;
781 wi->next = NULL;
782 wi->done = false;
784 qemu_cpu_kick(cpu);
787 static void flush_queued_work(CPUState *cpu)
789 struct qemu_work_item *wi;
791 if (cpu->queued_work_first == NULL) {
792 return;
795 while ((wi = cpu->queued_work_first)) {
796 cpu->queued_work_first = wi->next;
797 wi->func(wi->data);
798 wi->done = true;
799 if (wi->free) {
800 g_free(wi);
803 cpu->queued_work_last = NULL;
804 qemu_cond_broadcast(&qemu_work_cond);
807 static void qemu_wait_io_event_common(CPUState *cpu)
809 if (cpu->stop) {
810 cpu->stop = false;
811 cpu->stopped = true;
812 qemu_cond_signal(&qemu_pause_cond);
814 flush_queued_work(cpu);
815 cpu->thread_kicked = false;
818 static void qemu_tcg_wait_io_event(void)
820 CPUState *cpu;
822 while (all_cpu_threads_idle()) {
823 /* Start accounting real time to the virtual clock if the CPUs
824 are idle. */
825 qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
826 qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
829 while (iothread_requesting_mutex) {
830 qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
833 CPU_FOREACH(cpu) {
834 qemu_wait_io_event_common(cpu);
838 static void qemu_kvm_wait_io_event(CPUState *cpu)
840 while (cpu_thread_is_idle(cpu)) {
841 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
844 qemu_kvm_eat_signals(cpu);
845 qemu_wait_io_event_common(cpu);
848 static void *qemu_kvm_cpu_thread_fn(void *arg)
850 CPUState *cpu = arg;
851 int r;
853 qemu_mutex_lock(&qemu_global_mutex);
854 qemu_thread_get_self(cpu->thread);
855 cpu->thread_id = qemu_get_thread_id();
856 current_cpu = cpu;
858 r = kvm_init_vcpu(cpu);
859 if (r < 0) {
860 fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
861 exit(1);
864 qemu_kvm_init_cpu_signals(cpu);
866 /* signal CPU creation */
867 cpu->created = true;
868 qemu_cond_signal(&qemu_cpu_cond);
870 while (1) {
871 if (cpu_can_run(cpu)) {
872 r = kvm_cpu_exec(cpu);
873 if (r == EXCP_DEBUG) {
874 cpu_handle_guest_debug(cpu);
877 qemu_kvm_wait_io_event(cpu);
880 return NULL;
883 static void *qemu_dummy_cpu_thread_fn(void *arg)
885 #ifdef _WIN32
886 fprintf(stderr, "qtest is not supported under Windows\n");
887 exit(1);
888 #else
889 CPUState *cpu = arg;
890 sigset_t waitset;
891 int r;
893 qemu_mutex_lock_iothread();
894 qemu_thread_get_self(cpu->thread);
895 cpu->thread_id = qemu_get_thread_id();
897 sigemptyset(&waitset);
898 sigaddset(&waitset, SIG_IPI);
900 /* signal CPU creation */
901 cpu->created = true;
902 qemu_cond_signal(&qemu_cpu_cond);
904 current_cpu = cpu;
905 while (1) {
906 current_cpu = NULL;
907 qemu_mutex_unlock_iothread();
908 do {
909 int sig;
910 r = sigwait(&waitset, &sig);
911 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
912 if (r == -1) {
913 perror("sigwait");
914 exit(1);
916 qemu_mutex_lock_iothread();
917 current_cpu = cpu;
918 qemu_wait_io_event_common(cpu);
921 return NULL;
922 #endif
925 static void tcg_exec_all(void);
927 static void *qemu_tcg_cpu_thread_fn(void *arg)
929 CPUState *cpu = arg;
931 qemu_tcg_init_cpu_signals();
932 qemu_thread_get_self(cpu->thread);
934 qemu_mutex_lock(&qemu_global_mutex);
935 CPU_FOREACH(cpu) {
936 cpu->thread_id = qemu_get_thread_id();
937 cpu->created = true;
939 qemu_cond_signal(&qemu_cpu_cond);
941 /* wait for initial kick-off after machine start */
942 while (QTAILQ_FIRST(&cpus)->stopped) {
943 qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
945 /* process any pending work */
946 CPU_FOREACH(cpu) {
947 qemu_wait_io_event_common(cpu);
951 while (1) {
952 tcg_exec_all();
954 if (use_icount) {
955 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
957 if (deadline == 0) {
958 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
961 qemu_tcg_wait_io_event();
964 return NULL;
967 static void qemu_cpu_kick_thread(CPUState *cpu)
969 #ifndef _WIN32
970 int err;
972 err = pthread_kill(cpu->thread->thread, SIG_IPI);
973 if (err) {
974 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
975 exit(1);
977 #else /* _WIN32 */
978 if (!qemu_cpu_is_self(cpu)) {
979 CONTEXT tcgContext;
981 if (SuspendThread(cpu->hThread) == (DWORD)-1) {
982 fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
983 GetLastError());
984 exit(1);
987 /* On multi-core systems, we are not sure that the thread is actually
988 * suspended until we can get the context.
990 tcgContext.ContextFlags = CONTEXT_CONTROL;
991 while (GetThreadContext(cpu->hThread, &tcgContext) != 0) {
992 continue;
995 cpu_signal(0);
997 if (ResumeThread(cpu->hThread) == (DWORD)-1) {
998 fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
999 GetLastError());
1000 exit(1);
1003 #endif
1006 void qemu_cpu_kick(CPUState *cpu)
1008 qemu_cond_broadcast(cpu->halt_cond);
1009 if (!tcg_enabled() && !cpu->thread_kicked) {
1010 qemu_cpu_kick_thread(cpu);
1011 cpu->thread_kicked = true;
1015 void qemu_cpu_kick_self(void)
1017 #ifndef _WIN32
1018 assert(current_cpu);
1020 if (!current_cpu->thread_kicked) {
1021 qemu_cpu_kick_thread(current_cpu);
1022 current_cpu->thread_kicked = true;
1024 #else
1025 abort();
1026 #endif
1029 bool qemu_cpu_is_self(CPUState *cpu)
1031 return qemu_thread_is_self(cpu->thread);
1034 static bool qemu_in_vcpu_thread(void)
1036 return current_cpu && qemu_cpu_is_self(current_cpu);
1039 void qemu_mutex_lock_iothread(void)
1041 if (!tcg_enabled()) {
1042 qemu_mutex_lock(&qemu_global_mutex);
1043 } else {
1044 iothread_requesting_mutex = true;
1045 if (qemu_mutex_trylock(&qemu_global_mutex)) {
1046 qemu_cpu_kick_thread(first_cpu);
1047 qemu_mutex_lock(&qemu_global_mutex);
1049 iothread_requesting_mutex = false;
1050 qemu_cond_broadcast(&qemu_io_proceeded_cond);
1054 void qemu_mutex_unlock_iothread(void)
1056 qemu_mutex_unlock(&qemu_global_mutex);
1059 static int all_vcpus_paused(void)
1061 CPUState *cpu;
1063 CPU_FOREACH(cpu) {
1064 if (!cpu->stopped) {
1065 return 0;
1069 return 1;
1072 void pause_all_vcpus(void)
1074 CPUState *cpu;
1076 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1077 CPU_FOREACH(cpu) {
1078 cpu->stop = true;
1079 qemu_cpu_kick(cpu);
1082 if (qemu_in_vcpu_thread()) {
1083 cpu_stop_current();
1084 if (!kvm_enabled()) {
1085 CPU_FOREACH(cpu) {
1086 cpu->stop = false;
1087 cpu->stopped = true;
1089 return;
1093 while (!all_vcpus_paused()) {
1094 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1095 CPU_FOREACH(cpu) {
1096 qemu_cpu_kick(cpu);
1101 void cpu_resume(CPUState *cpu)
1103 cpu->stop = false;
1104 cpu->stopped = false;
1105 qemu_cpu_kick(cpu);
1108 void resume_all_vcpus(void)
1110 CPUState *cpu;
1112 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1113 CPU_FOREACH(cpu) {
1114 cpu_resume(cpu);
/* Size of the temporary buffers in which vCPU thread names are formatted */
#define VCPU_THREAD_NAME_SIZE 16
1121 static void qemu_tcg_init_vcpu(CPUState *cpu)
1123 char thread_name[VCPU_THREAD_NAME_SIZE];
1125 tcg_cpu_address_space_init(cpu, cpu->as);
1127 /* share a single thread for all cpus with TCG */
1128 if (!tcg_cpu_thread) {
1129 cpu->thread = g_malloc0(sizeof(QemuThread));
1130 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1131 qemu_cond_init(cpu->halt_cond);
1132 tcg_halt_cond = cpu->halt_cond;
1133 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1134 cpu->cpu_index);
1135 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1136 cpu, QEMU_THREAD_JOINABLE);
1137 #ifdef _WIN32
1138 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1139 #endif
1140 while (!cpu->created) {
1141 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1143 tcg_cpu_thread = cpu->thread;
1144 } else {
1145 cpu->thread = tcg_cpu_thread;
1146 cpu->halt_cond = tcg_halt_cond;
1150 static void qemu_kvm_start_vcpu(CPUState *cpu)
1152 char thread_name[VCPU_THREAD_NAME_SIZE];
1154 cpu->thread = g_malloc0(sizeof(QemuThread));
1155 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1156 qemu_cond_init(cpu->halt_cond);
1157 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1158 cpu->cpu_index);
1159 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1160 cpu, QEMU_THREAD_JOINABLE);
1161 while (!cpu->created) {
1162 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1166 static void qemu_dummy_start_vcpu(CPUState *cpu)
1168 char thread_name[VCPU_THREAD_NAME_SIZE];
1170 cpu->thread = g_malloc0(sizeof(QemuThread));
1171 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1172 qemu_cond_init(cpu->halt_cond);
1173 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1174 cpu->cpu_index);
1175 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1176 QEMU_THREAD_JOINABLE);
1177 while (!cpu->created) {
1178 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1182 void qemu_init_vcpu(CPUState *cpu)
1184 cpu->nr_cores = smp_cores;
1185 cpu->nr_threads = smp_threads;
1186 cpu->stopped = true;
1187 if (kvm_enabled()) {
1188 qemu_kvm_start_vcpu(cpu);
1189 } else if (tcg_enabled()) {
1190 qemu_tcg_init_vcpu(cpu);
1191 } else {
1192 qemu_dummy_start_vcpu(cpu);
1196 void cpu_stop_current(void)
1198 if (current_cpu) {
1199 current_cpu->stop = false;
1200 current_cpu->stopped = true;
1201 cpu_exit(current_cpu);
1202 qemu_cond_signal(&qemu_pause_cond);
1206 int vm_stop(RunState state)
1208 if (qemu_in_vcpu_thread()) {
1209 qemu_system_vmstop_request(state);
1211 * FIXME: should not return to device code in case
1212 * vm_stop() has been requested.
1214 cpu_stop_current();
1215 return 0;
1218 return do_vm_stop(state);
1221 /* does a state transition even if the VM is already stopped,
1222 current state is forgotten forever */
1223 int vm_stop_force_state(RunState state)
1225 if (runstate_is_running()) {
1226 return vm_stop(state);
1227 } else {
1228 runstate_set(state);
1229 /* Make sure to return an error if the flush in a previous vm_stop()
1230 * failed. */
1231 return bdrv_flush_all();
1235 static int tcg_cpu_exec(CPUArchState *env)
1237 CPUState *cpu = ENV_GET_CPU(env);
1238 int ret;
1239 #ifdef CONFIG_PROFILER
1240 int64_t ti;
1241 #endif
1243 #ifdef CONFIG_PROFILER
1244 ti = profile_getclock();
1245 #endif
1246 if (use_icount) {
1247 int64_t count;
1248 int64_t deadline;
1249 int decr;
1250 qemu_icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
1251 cpu->icount_decr.u16.low = 0;
1252 cpu->icount_extra = 0;
1253 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1255 /* Maintain prior (possibly buggy) behaviour where if no deadline
1256 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1257 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1258 * nanoseconds.
1260 if ((deadline < 0) || (deadline > INT32_MAX)) {
1261 deadline = INT32_MAX;
1264 count = qemu_icount_round(deadline);
1265 qemu_icount += count;
1266 decr = (count > 0xffff) ? 0xffff : count;
1267 count -= decr;
1268 cpu->icount_decr.u16.low = decr;
1269 cpu->icount_extra = count;
1271 ret = cpu_exec(env);
1272 #ifdef CONFIG_PROFILER
1273 qemu_time += profile_getclock() - ti;
1274 #endif
1275 if (use_icount) {
1276 /* Fold pending instructions back into the
1277 instruction counter, and clear the interrupt flag. */
1278 qemu_icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
1279 cpu->icount_decr.u32 = 0;
1280 cpu->icount_extra = 0;
1282 return ret;
1285 static void tcg_exec_all(void)
1287 int r;
1289 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1290 qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
1292 if (next_cpu == NULL) {
1293 next_cpu = first_cpu;
1295 for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) {
1296 CPUState *cpu = next_cpu;
1297 CPUArchState *env = cpu->env_ptr;
1299 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1300 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1302 if (cpu_can_run(cpu)) {
1303 r = tcg_cpu_exec(env);
1304 if (r == EXCP_DEBUG) {
1305 cpu_handle_guest_debug(cpu);
1306 break;
1308 } else if (cpu->stop || cpu->stopped) {
1309 break;
1312 exit_request = 0;
1315 void set_numa_modes(void)
1317 CPUState *cpu;
1318 int i;
1320 CPU_FOREACH(cpu) {
1321 for (i = 0; i < nb_numa_nodes; i++) {
1322 if (test_bit(cpu->cpu_index, node_cpumask[i])) {
1323 cpu->numa_node = i;
1329 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1331 /* XXX: implement xxx_cpu_list for targets that still miss it */
1332 #if defined(cpu_list)
1333 cpu_list(f, cpu_fprintf);
1334 #endif
1337 CpuInfoList *qmp_query_cpus(Error **errp)
1339 CpuInfoList *head = NULL, *cur_item = NULL;
1340 CPUState *cpu;
1342 CPU_FOREACH(cpu) {
1343 CpuInfoList *info;
1344 #if defined(TARGET_I386)
1345 X86CPU *x86_cpu = X86_CPU(cpu);
1346 CPUX86State *env = &x86_cpu->env;
1347 #elif defined(TARGET_PPC)
1348 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1349 CPUPPCState *env = &ppc_cpu->env;
1350 #elif defined(TARGET_SPARC)
1351 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1352 CPUSPARCState *env = &sparc_cpu->env;
1353 #elif defined(TARGET_MIPS)
1354 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1355 CPUMIPSState *env = &mips_cpu->env;
1356 #endif
1358 cpu_synchronize_state(cpu);
1360 info = g_malloc0(sizeof(*info));
1361 info->value = g_malloc0(sizeof(*info->value));
1362 info->value->CPU = cpu->cpu_index;
1363 info->value->current = (cpu == first_cpu);
1364 info->value->halted = cpu->halted;
1365 info->value->thread_id = cpu->thread_id;
1366 #if defined(TARGET_I386)
1367 info->value->has_pc = true;
1368 info->value->pc = env->eip + env->segs[R_CS].base;
1369 #elif defined(TARGET_PPC)
1370 info->value->has_nip = true;
1371 info->value->nip = env->nip;
1372 #elif defined(TARGET_SPARC)
1373 info->value->has_pc = true;
1374 info->value->pc = env->pc;
1375 info->value->has_npc = true;
1376 info->value->npc = env->npc;
1377 #elif defined(TARGET_MIPS)
1378 info->value->has_PC = true;
1379 info->value->PC = env->active_tc.PC;
1380 #endif
1382 /* XXX: waiting for the qapi to support GSList */
1383 if (!cur_item) {
1384 head = cur_item = info;
1385 } else {
1386 cur_item->next = info;
1387 cur_item = info;
1391 return head;
1394 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1395 bool has_cpu, int64_t cpu_index, Error **errp)
1397 FILE *f;
1398 uint32_t l;
1399 CPUState *cpu;
1400 uint8_t buf[1024];
1402 if (!has_cpu) {
1403 cpu_index = 0;
1406 cpu = qemu_get_cpu(cpu_index);
1407 if (cpu == NULL) {
1408 error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1409 "a CPU number");
1410 return;
1413 f = fopen(filename, "wb");
1414 if (!f) {
1415 error_setg_file_open(errp, errno, filename);
1416 return;
1419 while (size != 0) {
1420 l = sizeof(buf);
1421 if (l > size)
1422 l = size;
1423 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1424 error_setg(errp, "Invalid addr 0x%016" PRIx64 "specified", addr);
1425 goto exit;
1427 if (fwrite(buf, 1, l, f) != l) {
1428 error_set(errp, QERR_IO_ERROR);
1429 goto exit;
1431 addr += l;
1432 size -= l;
1435 exit:
1436 fclose(f);
1439 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1440 Error **errp)
1442 FILE *f;
1443 uint32_t l;
1444 uint8_t buf[1024];
1446 f = fopen(filename, "wb");
1447 if (!f) {
1448 error_setg_file_open(errp, errno, filename);
1449 return;
1452 while (size != 0) {
1453 l = sizeof(buf);
1454 if (l > size)
1455 l = size;
1456 cpu_physical_memory_read(addr, buf, l);
1457 if (fwrite(buf, 1, l, f) != l) {
1458 error_set(errp, QERR_IO_ERROR);
1459 goto exit;
1461 addr += l;
1462 size -= l;
1465 exit:
1466 fclose(f);
1469 void qmp_inject_nmi(Error **errp)
1471 #if defined(TARGET_I386)
1472 CPUState *cs;
1474 CPU_FOREACH(cs) {
1475 X86CPU *cpu = X86_CPU(cs);
1477 if (!cpu->apic_state) {
1478 cpu_interrupt(cs, CPU_INTERRUPT_NMI);
1479 } else {
1480 apic_deliver_nmi(cpu->apic_state);
1483 #elif defined(TARGET_S390X)
1484 CPUState *cs;
1485 S390CPU *cpu;
1487 CPU_FOREACH(cs) {
1488 cpu = S390_CPU(cs);
1489 if (cpu->env.cpu_num == monitor_get_cpu_index()) {
1490 if (s390_cpu_restart(S390_CPU(cs)) == -1) {
1491 error_set(errp, QERR_UNSUPPORTED);
1492 return;
1494 break;
1497 #else
1498 error_set(errp, QERR_UNSUPPORTED);
1499 #endif