cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu/config-file.h"
  27 #include "cpu.h"
  28 #include "monitor/monitor.h"
  29 #include "qapi/error.h"
  30 #include "qapi/qapi-commands-misc.h"
  31 #include "qapi/qapi-events-run-state.h"
  32 #include "qapi/qmp/qerror.h"
  33 #include "qemu/error-report.h"
  34 #include "sysemu/sysemu.h"
  35 #include "sysemu/block-backend.h"
  36 #include "exec/gdbstub.h"
  37 #include "sysemu/dma.h"
  38 #include "sysemu/hw_accel.h"
  39 #include "sysemu/kvm.h"
  40 #include "sysemu/hax.h"
  41 #include "sysemu/hvf.h"
  42 #include "sysemu/whpx.h"
  43 #include "exec/exec-all.h"
  44
  45 #include "qemu/thread.h"
  46 #include "sysemu/cpus.h"
  47 #include "sysemu/qtest.h"
  48 #include "qemu/main-loop.h"
  49 #include "qemu/option.h"
  50 #include "qemu/bitmap.h"
  51 #include "qemu/seqlock.h"
  52 #include "tcg.h"
  53 #include "hw/nmi.h"
  54 #include "sysemu/replay.h"
  55 #include "hw/boards.h"
  56
#ifdef CONFIG_LINUX

#include <sys/prctl.h>

/* Fallback values for the PR_MCE_KILL* prctl constants.  Each one is only
 * defined here when <sys/prctl.h> did not already provide it.
 */
#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */
  74
/* Not referenced in the visible part of this file.
 * NOTE(review): presumably consumed by the icount "align" code elsewhere
 * -- confirm against the users of these symbols.
 */
int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
/* Timer whose callback is cpu_throttle_timer_tick(); created in
 * cpu_ticks_init() and (re)armed by cpu_throttle_set().
 */
static QEMUTimer *throttle_timer;
/* Current throttle percentage; 0 means throttling is off.  Accessed with
 * atomic_set()/atomic_read().
 */
static unsigned int throttle_percentage;

/* Bounds that cpu_throttle_set() clamps the requested percentage to. */
#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/* Base timeslice in nanoseconds (10 ms) from which the vCPU sleep time and
 * the throttle timer period are derived.
 */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
  85
  86 bool cpu_is_stopped(CPUState *cpu)
  87 {
  88     return cpu->stopped || !runstate_is_running();
  89 }
  90
  91 static bool cpu_thread_is_idle(CPUState *cpu)
  92 {
  93     if (cpu->stop || cpu->queued_work_first) {
  94         return false;
  95     }
  96     if (cpu_is_stopped(cpu)) {
  97         return true;
  98     }
  99     if (!cpu->halted || cpu_has_work(cpu) ||
 100         kvm_halt_in_kernel()) {
 101         return false;
 102     }
 103     return true;
 104 }
 105
 106 static bool all_cpu_threads_idle(void)
 107 {
 108     CPUState *cpu;
 109
 110     CPU_FOREACH(cpu) {
 111         if (!cpu_thread_is_idle(cpu)) {
 112             return false;
 113         }
 114     }
 115     return true;
 116 }
 117
/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

/* Set from the "sleep" option in configure_icount().  When false,
 * qemu_start_warp_timer() advances the virtual clock straight to the next
 * deadline instead of arming the warp timer.
 */
static bool icount_sleep = true;
/* Conversion factor from emulated instructions to virtual clock ticks.
 * cpu_icount_to_ns() shifts the instruction count left by this amount.
 */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.
 * icount_adjust() never raises icount_time_shift above this value.
 */
#define MAX_ICOUNT_SHIFT 10
 128
/* Clock and instruction-counter bookkeeping shared by the timing functions
 * in this file.  A single instance, timers_state, is used.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    /* Last value returned by cpu_get_ticks(); used to keep ticks monotonic. */
    int64_t cpu_ticks_prev;
    /* Offset added to the host tick counter by cpu_get_ticks(). */
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read out of BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    /* Offset added to get_clock() by cpu_get_clock_locked(). */
    int64_t cpu_clock_offset;
    /* Nonzero between cpu_enable_ticks() and cpu_disable_ticks(). */
    int32_t cpu_ticks_enabled;
    /* Only referenced from vmstate_timers in the visible code.
     * NOTE(review): presumably a migration-format placeholder -- confirm.
     */
    int64_t dummy;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
    /* for adjusting icount */
    /* Start of the pending clock warp, or -1 when none is pending. */
    int64_t vm_clock_warp_start;
    /* Periodic adjustment timers, created only in shift=auto mode. */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    /* Created by configure_icount() only when icount sleep is enabled. */
    QEMUTimer *icount_warp_timer;
} TimersState;
 152
/* The single TimersState instance used throughout this file. */
static TimersState timers_state;
/* True when multi-threaded TCG is selected; set by qemu_tcg_configure(). */
bool mttcg_enabled;
 155
 156 /*
 157  * We default to false if we know other options have been enabled
 158  * which are currently incompatible with MTTCG. Otherwise when each
 159  * guest (target) has been updated to support:
 160  *   - atomic instructions
 161  *   - memory ordering primitives (barriers)
 162  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 163  *
 164  * Once a guest architecture has been converted to the new primitives
 165  * there are two remaining limitations to check.
 166  *
 167  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 168  * - The host must have a stronger memory order than the guest
 169  *
 170  * It may be possible in future to support strong guests on weak hosts
 171  * but that will require tagging all load/stores in a guest with their
 172  * implicit memory order requirements which would likely slow things
 173  * down a lot.
 174  */
 175
 176 static bool check_tcg_memory_orders_compatible(void)
 177 {
 178 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 179     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 180 #else
 181     return false;
 182 #endif
 183 }
 184
 185 static bool default_mttcg_enabled(void)
 186 {
 187     if (use_icount || TCG_OVERSIZED_GUEST) {
 188         return false;
 189     } else {
 190 #ifdef TARGET_SUPPORTS_MTTCG
 191         return check_tcg_memory_orders_compatible();
 192 #else
 193         return false;
 194 #endif
 195     }
 196 }
 197
 198 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 199 {
 200     const char *t = qemu_opt_get(opts, "thread");
 201     if (t) {
 202         if (strcmp(t, "multi") == 0) {
 203             if (TCG_OVERSIZED_GUEST) {
 204                 error_setg(errp, "No MTTCG when guest word size > hosts");
 205             } else if (use_icount) {
 206                 error_setg(errp, "No MTTCG when icount is enabled");
 207             } else {
 208 #ifndef TARGET_SUPPORTS_MTTCG
 209                 error_report("Guest not yet converted to MTTCG - "
 210                              "you may get unexpected results");
 211 #endif
 212                 if (!check_tcg_memory_orders_compatible()) {
 213                     error_report("Guest expects a stronger memory ordering "
 214                                  "than the host provides");
 215                     error_printf("This may cause strange/hard to debug errors\n");
 216                 }
 217                 mttcg_enabled = true;
 218             }
 219         } else if (strcmp(t, "single") == 0) {
 220             mttcg_enabled = false;
 221         } else {
 222             error_setg(errp, "Invalid 'thread' setting %s", t);
 223         }
 224     } else {
 225         mttcg_enabled = default_mttcg_enabled();
 226     }
 227 }
 228
 229 /* The current number of executed instructions is based on what we
 230  * originally budgeted minus the current state of the decrementing
 231  * icount counters in extra/u16.low.
 232  */
 233 static int64_t cpu_get_icount_executed(CPUState *cpu)
 234 {
 235     return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
 236 }
 237
 238 /*
 239  * Update the global shared timer_state.qemu_icount to take into
 240  * account executed instructions. This is done by the TCG vCPU
 241  * thread so the main-loop can see time has moved forward.
 242  */
 243 void cpu_update_icount(CPUState *cpu)
 244 {
 245     int64_t executed = cpu_get_icount_executed(cpu);
 246     cpu->icount_budget -= executed;
 247
 248 #ifdef CONFIG_ATOMIC64
 249     atomic_set__nocheck(&timers_state.qemu_icount,
 250                         atomic_read__nocheck(&timers_state.qemu_icount) +
 251                         executed);
 252 #else /* FIXME: we need 64bit atomics to do this safely */
 253     timers_state.qemu_icount += executed;
 254 #endif
 255 }
 256
 257 int64_t cpu_get_icount_raw(void)
 258 {
 259     CPUState *cpu = current_cpu;
 260
 261     if (cpu && cpu->running) {
 262         if (!cpu->can_do_io) {
 263             error_report("Bad icount read");
 264             exit(1);
 265         }
 266         /* Take into account what has run */
 267         cpu_update_icount(cpu);
 268     }
 269 #ifdef CONFIG_ATOMIC64
 270     return atomic_read__nocheck(&timers_state.qemu_icount);
 271 #else /* FIXME: we need 64bit atomics to do this safely */
 272     return timers_state.qemu_icount;
 273 #endif
 274 }
 275
 276 /* Return the virtual CPU time, based on the instruction counter.  */
 277 static int64_t cpu_get_icount_locked(void)
 278 {
 279     int64_t icount = cpu_get_icount_raw();
 280     return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
 281 }
 282
 283 int64_t cpu_get_icount(void)
 284 {
 285     int64_t icount;
 286     unsigned start;
 287
 288     do {
 289         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 290         icount = cpu_get_icount_locked();
 291     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 292
 293     return icount;
 294 }
 295
 296 int64_t cpu_icount_to_ns(int64_t icount)
 297 {
 298     return icount << icount_time_shift;
 299 }
 300
 301 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 302  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 303  * counter.
 304  *
 305  * Caller must hold the BQL
 306  */
 307 int64_t cpu_get_ticks(void)
 308 {
 309     int64_t ticks;
 310
 311     if (use_icount) {
 312         return cpu_get_icount();
 313     }
 314
 315     ticks = timers_state.cpu_ticks_offset;
 316     if (timers_state.cpu_ticks_enabled) {
 317         ticks += cpu_get_host_ticks();
 318     }
 319
 320     if (timers_state.cpu_ticks_prev > ticks) {
 321         /* Note: non increasing ticks may happen if the host uses
 322            software suspend */
 323         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 324         ticks = timers_state.cpu_ticks_prev;
 325     }
 326
 327     timers_state.cpu_ticks_prev = ticks;
 328     return ticks;
 329 }
 330
 331 static int64_t cpu_get_clock_locked(void)
 332 {
 333     int64_t time;
 334
 335     time = timers_state.cpu_clock_offset;
 336     if (timers_state.cpu_ticks_enabled) {
 337         time += get_clock();
 338     }
 339
 340     return time;
 341 }
 342
 343 /* Return the monotonic time elapsed in VM, i.e.,
 344  * the time between vm_start and vm_stop
 345  */
 346 int64_t cpu_get_clock(void)
 347 {
 348     int64_t ti;
 349     unsigned start;
 350
 351     do {
 352         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 353         ti = cpu_get_clock_locked();
 354     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 355
 356     return ti;
 357 }
 358
/* Enable cpu_get_ticks() and the VM clock.
 *
 * If ticks are currently disabled, the current host tick counter and
 * get_clock() values are subtracted from the respective offsets so that
 * both clocks resume from the values they had when they were stopped.
 * Calling this while already enabled changes nothing.
 *
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    /* What the seqlock really protects here is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}
 373
/* Disable cpu_get_ticks(): the clock is stopped.  You must not call
 * cpu_get_ticks() after that.
 *
 * If ticks are currently enabled, the current tick and clock values are
 * folded into the offsets so they are preserved while stopped.  Calling
 * this while already disabled changes nothing.
 *
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    /* What the seqlock really protects here is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
        /* Must be computed while cpu_ticks_enabled is still set. */
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}
 389
/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.  */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)

/* Re-tune icount_time_shift so that icount-based virtual time tracks the
 * VM clock.
 *
 * The difference between the two clocks is compared with the difference
 * seen on the previous call.  If virtual time is ahead and the lead is
 * growing, the shift is decreased; if it is behind and falling further
 * behind, the shift is increased (up to MAX_ICOUNT_SHIFT).  The bias is then
 * recomputed so the virtual time does not jump when the shift changes.
 * Does nothing while the VM is not running.
 */
static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex.  */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    /* Positive: virtual time is ahead of the VM clock. */
    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        icount_time_shift++;
    }
    last_delta = delta;
    /* Keep the current virtual time unchanged under the new shift. */
    timers_state.qemu_icount_bias = cur_icount
                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}
 433
 434 static void icount_adjust_rt(void *opaque)
 435 {
 436     timer_mod(timers_state.icount_rt_timer,
 437               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 438     icount_adjust();
 439 }
 440
 441 static void icount_adjust_vm(void *opaque)
 442 {
 443     timer_mod(timers_state.icount_vm_timer,
 444                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 445                    NANOSECONDS_PER_SECOND / 10);
 446     icount_adjust();
 447 }
 448
 449 static int64_t qemu_icount_round(int64_t count)
 450 {
 451     return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
 452 }
 453
/* Account for a pending clock warp.
 *
 * If a warp is pending (vm_clock_warp_start != -1) and the VM is running,
 * the time elapsed since the warp started is added to qemu_icount_bias.
 * The pending warp is then cleared, and QEMU_CLOCK_VIRTUAL is notified if
 * it now has expired timers.
 */
static void icount_warp_rt(void)
{
    unsigned seq;
    int64_t warp_start;

    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    do {
        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        warp_start = timers_state.vm_clock_warp_start;
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));

    /* No warp pending. */
    if (warp_start == -1) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
                                     cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - timers_state.vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        timers_state.qemu_icount_bias += warp_delta;
    }
    /* The warp is consumed whether or not the VM was running. */
    timers_state.vm_clock_warp_start = -1;
    seqlock_write_end(&timers_state.vm_clock_seqlock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}
 496
 497 static void icount_timer_cb(void *opaque)
 498 {
 499     /* No need for a checkpoint because the timer already synchronizes
 500      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 501      */
 502     icount_warp_rt();
 503 }
 504
 505 void qtest_clock_warp(int64_t dest)
 506 {
 507     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 508     AioContext *aio_context;
 509     assert(qtest_enabled());
 510     aio_context = qemu_get_aio_context();
 511     while (clock < dest) {
 512         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 513         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 514
 515         seqlock_write_begin(&timers_state.vm_clock_seqlock);
 516         timers_state.qemu_icount_bias += warp;
 517         seqlock_write_end(&timers_state.vm_clock_seqlock);
 518
 519         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 520         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 521         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 522     }
 523     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 524 }
 525
 526 void qemu_start_warp_timer(void)
 527 {
 528     int64_t clock;
 529     int64_t deadline;
 530
 531     if (!use_icount) {
 532         return;
 533     }
 534
 535     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 536      * do not fire, so computing the deadline does not make sense.
 537      */
 538     if (!runstate_is_running()) {
 539         return;
 540     }
 541
 542     /* warp clock deterministically in record/replay mode */
 543     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 544         return;
 545     }
 546
 547     if (!all_cpu_threads_idle()) {
 548         return;
 549     }
 550
 551     if (qtest_enabled()) {
 552         /* When testing, qtest commands advance icount.  */
 553         return;
 554     }
 555
 556     /* We want to use the earliest deadline from ALL vm_clocks */
 557     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 558     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 559     if (deadline < 0) {
 560         static bool notified;
 561         if (!icount_sleep && !notified) {
 562             warn_report("icount sleep disabled and no active timers");
 563             notified = true;
 564         }
 565         return;
 566     }
 567
 568     if (deadline > 0) {
 569         /*
 570          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 571          * sleep.  Otherwise, the CPU might be waiting for a future timer
 572          * interrupt to wake it up, but the interrupt never comes because
 573          * the vCPU isn't running any insns and thus doesn't advance the
 574          * QEMU_CLOCK_VIRTUAL.
 575          */
 576         if (!icount_sleep) {
 577             /*
 578              * We never let VCPUs sleep in no sleep icount mode.
 579              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 580              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 581              * It is useful when we want a deterministic execution time,
 582              * isolated from host latencies.
 583              */
 584             seqlock_write_begin(&timers_state.vm_clock_seqlock);
 585             timers_state.qemu_icount_bias += deadline;
 586             seqlock_write_end(&timers_state.vm_clock_seqlock);
 587             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 588         } else {
 589             /*
 590              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 591              * "real" time, (related to the time left until the next event) has
 592              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 593              * This avoids that the warps are visible externally; for example,
 594              * you will not be sending network packets continuously instead of
 595              * every 100ms.
 596              */
 597             seqlock_write_begin(&timers_state.vm_clock_seqlock);
 598             if (timers_state.vm_clock_warp_start == -1
 599                 || timers_state.vm_clock_warp_start > clock) {
 600                 timers_state.vm_clock_warp_start = clock;
 601             }
 602             seqlock_write_end(&timers_state.vm_clock_seqlock);
 603             timer_mod_anticipate(timers_state.icount_warp_timer,
 604                                  clock + deadline);
 605         }
 606     } else if (deadline == 0) {
 607         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 608     }
 609 }
 610
 611 static void qemu_account_warp_timer(void)
 612 {
 613     if (!use_icount || !icount_sleep) {
 614         return;
 615     }
 616
 617     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 618      * do not fire, so computing the deadline does not make sense.
 619      */
 620     if (!runstate_is_running()) {
 621         return;
 622     }
 623
 624     /* warp clock deterministically in record/replay mode */
 625     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 626         return;
 627     }
 628
 629     timer_del(timers_state.icount_warp_timer);
 630     icount_warp_rt();
 631 }
 632
 633 static bool icount_state_needed(void *opaque)
 634 {
 635     return use_icount;
 636 }
 637
 638 static bool warp_timer_state_needed(void *opaque)
 639 {
 640     TimersState *s = opaque;
 641     return s->icount_warp_timer != NULL;
 642 }
 643
 644 static bool adjust_timers_state_needed(void *opaque)
 645 {
 646     TimersState *s = opaque;
 647     return s->icount_rt_timer != NULL;
 648 }
 649
/*
 * The subsection for warp timer migration is optional, because the warp
 * timer may not have been created (see warp_timer_state_needed()).
 * It carries the pending warp start time and the warp timer itself.
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 664
/*
 * Optional subsection carrying the two speed-adjustment timers.  It is
 * included only when adjust_timers_state_needed() reports that
 * icount_rt_timer exists.
 */
static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 676
/*
 * This is a subsection for icount migration.  It is included only when
 * icount is in use and carries the icount bias and instruction counter;
 * the warp timer and adjustment timers live in nested subsections.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        NULL
    }
};
 696
/*
 * Top-level migration description for TimersState, registered in
 * cpu_ticks_init().  cpu_clock_offset is only present from version 2 on;
 * icount state travels in the "timer/icount" subsection.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
 712
 713 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 714 {
 715     double pct;
 716     double throttle_ratio;
 717     long sleeptime_ns;
 718
 719     if (!cpu_throttle_get_percentage()) {
 720         return;
 721     }
 722
 723     pct = (double)cpu_throttle_get_percentage()/100;
 724     throttle_ratio = pct / (1 - pct);
 725     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
 726
 727     qemu_mutex_unlock_iothread();
 728     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
 729     qemu_mutex_lock_iothread();
 730     atomic_set(&cpu->throttle_thread_scheduled, 0);
 731 }
 732
 733 static void cpu_throttle_timer_tick(void *opaque)
 734 {
 735     CPUState *cpu;
 736     double pct;
 737
 738     /* Stop the timer if needed */
 739     if (!cpu_throttle_get_percentage()) {
 740         return;
 741     }
 742     CPU_FOREACH(cpu) {
 743         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 744             async_run_on_cpu(cpu, cpu_throttle_thread,
 745                              RUN_ON_CPU_NULL);
 746         }
 747     }
 748
 749     pct = (double)cpu_throttle_get_percentage()/100;
 750     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 751                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 752 }
 753
 754 void cpu_throttle_set(int new_throttle_pct)
 755 {
 756     /* Ensure throttle percentage is within valid range */
 757     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 758     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 759
 760     atomic_set(&throttle_percentage, new_throttle_pct);
 761
 762     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 763                                        CPU_THROTTLE_TIMESLICE_NS);
 764 }
 765
/* Turn throttling off by setting the percentage to 0.  The throttle timer
 * is not deleted here; cpu_throttle_timer_tick() stops re-arming it once
 * it sees the zero percentage.
 */
void cpu_throttle_stop(void)
{
    atomic_set(&throttle_percentage, 0);
}
 770
 771 bool cpu_throttle_active(void)
 772 {
 773     return (cpu_throttle_get_percentage() != 0);
 774 }
 775
 776 int cpu_throttle_get_percentage(void)
 777 {
 778     return atomic_read(&throttle_percentage);
 779 }
 780
/* One-time setup of the timing state: initialise the VM clock seqlock,
 * register timers_state for migration, and create (but do not arm) the
 * vCPU throttle timer.
 */
void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    /* Armed later by cpu_throttle_set(). */
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                           cpu_throttle_timer_tick, NULL);
}
 788
 789 void configure_icount(QemuOpts *opts, Error **errp)
 790 {
 791     const char *option;
 792     char *rem_str = NULL;
 793
 794     option = qemu_opt_get(opts, "shift");
 795     if (!option) {
 796         if (qemu_opt_get(opts, "align") != NULL) {
 797             error_setg(errp, "Please specify shift option when using align");
 798         }
 799         return;
 800     }
 801
 802     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 803     if (icount_sleep) {
 804         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 805                                          icount_timer_cb, NULL);
 806     }
 807
 808     icount_align_option = qemu_opt_get_bool(opts, "align", false);
 809
 810     if (icount_align_option && !icount_sleep) {
 811         error_setg(errp, "align=on and sleep=off are incompatible");
 812     }
 813     if (strcmp(option, "auto") != 0) {
 814         errno = 0;
 815         icount_time_shift = strtol(option, &rem_str, 0);
 816         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 817             error_setg(errp, "icount: Invalid shift value");
 818         }
 819         use_icount = 1;
 820         return;
 821     } else if (icount_align_option) {
 822         error_setg(errp, "shift=auto and align=on are incompatible");
 823     } else if (!icount_sleep) {
 824         error_setg(errp, "shift=auto and sleep=off are incompatible");
 825     }
 826
 827     use_icount = 2;
 828
 829     /* 125MIPS seems a reasonable initial guess at the guest speed.
 830        It will be corrected fairly quickly anyway.  */
 831     icount_time_shift = 3;
 832
 833     /* Have both realtime and virtual time triggers for speed adjustment.
 834        The realtime trigger catches emulated time passing too slowly,
 835        the virtual time trigger catches emulated time passing too fast.
 836        Realtime triggers occur even when idle, so use them less frequently
 837        than VM triggers.  */
 838     timers_state.vm_clock_warp_start = -1;
 839     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 840                                    icount_adjust_rt, NULL);
 841     timer_mod(timers_state.icount_rt_timer,
 842                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 843     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 844                                         icount_adjust_vm, NULL);
 845     timer_mod(timers_state.icount_vm_timer,
 846                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 847                    NANOSECONDS_PER_SECOND / 10);
 848 }
 849
/***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */

/* The kick timer; NULL while no timer is active. */
static QEMUTimer *tcg_kick_vcpu_timer;
/* The vCPU currently scheduled by the round-robin loop, read with
 * atomic_mb_read() by qemu_cpu_kick_rr_cpu().
 */
static CPUState *tcg_current_rr_cpu;

/* Interval between kicks: a tenth of a second. */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 866
 867 static inline int64_t qemu_tcg_next_kick(void)
 868 {
 869     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 870 }
 871
 872 /* Kick the currently round-robin scheduled vCPU */
 873 static void qemu_cpu_kick_rr_cpu(void)
 874 {
 875     CPUState *cpu;
 876     do {
 877         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 878         if (cpu) {
 879             cpu_exit(cpu);
 880         }
 881     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 882 }
 883
/* Deliberately empty work item.  qemu_timer_notify_cb() queues it on a
 * vCPU purely for the side effect of having queued work pending.
 */
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}
 887
 888 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 889 {
 890     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 891         qemu_notify_event();
 892         return;
 893     }
 894
 895     if (!qemu_in_vcpu_thread() && first_cpu) {
 896         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 897          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 898          * causes cpu_thread_is_idle to return false.  This way,
 899          * handle_icount_deadline can run.
 900          */
 901         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 902     }
 903 }
 904
 905 static void kick_tcg_thread(void *opaque)
 906 {
 907     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 908     qemu_cpu_kick_rr_cpu();
 909 }
 910
 911 static void start_tcg_kick_timer(void)
 912 {
 913     assert(!mttcg_enabled);
 914     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 915         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 916                                            kick_tcg_thread, NULL);
 917         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 918     }
 919 }
 920
 921 static void stop_tcg_kick_timer(void)
 922 {
 923     assert(!mttcg_enabled);
 924     if (tcg_kick_vcpu_timer) {
 925         timer_del(tcg_kick_vcpu_timer);
 926         tcg_kick_vcpu_timer = NULL;
 927     }
 928 }
 929
 930 /***********************************************************/
 931 void hw_error(const char *fmt, ...)
 932 {
 933     va_list ap;
 934     CPUState *cpu;
 935
 936     va_start(ap, fmt);
 937     fprintf(stderr, "qemu: hardware error: ");
 938     vfprintf(stderr, fmt, ap);
 939     fprintf(stderr, "\n");
 940     CPU_FOREACH(cpu) {
 941         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
 942         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
 943     }
 944     va_end(ap);
 945     abort();
 946 }
 947
 948 void cpu_synchronize_all_states(void)
 949 {
 950     CPUState *cpu;
 951
 952     CPU_FOREACH(cpu) {
 953         cpu_synchronize_state(cpu);
 954         /* TODO: move to cpu_synchronize_state() */
 955         if (hvf_enabled()) {
 956             hvf_cpu_synchronize_state(cpu);
 957         }
 958     }
 959 }
 960
 961 void cpu_synchronize_all_post_reset(void)
 962 {
 963     CPUState *cpu;
 964
 965     CPU_FOREACH(cpu) {
 966         cpu_synchronize_post_reset(cpu);
 967         /* TODO: move to cpu_synchronize_post_reset() */
 968         if (hvf_enabled()) {
 969             hvf_cpu_synchronize_post_reset(cpu);
 970         }
 971     }
 972 }
 973
 974 void cpu_synchronize_all_post_init(void)
 975 {
 976     CPUState *cpu;
 977
 978     CPU_FOREACH(cpu) {
 979         cpu_synchronize_post_init(cpu);
 980         /* TODO: move to cpu_synchronize_post_init() */
 981         if (hvf_enabled()) {
 982             hvf_cpu_synchronize_post_init(cpu);
 983         }
 984     }
 985 }
 986
 987 void cpu_synchronize_all_pre_loadvm(void)
 988 {
 989     CPUState *cpu;
 990
 991     CPU_FOREACH(cpu) {
 992         cpu_synchronize_pre_loadvm(cpu);
 993     }
 994 }
 995
 996 static int do_vm_stop(RunState state, bool send_stop)
 997 {
 998     int ret = 0;
 999
1000     if (runstate_is_running()) {
1001         cpu_disable_ticks();
1002         pause_all_vcpus();
1003         runstate_set(state);
1004         vm_state_notify(0, state);
1005         if (send_stop) {
1006             qapi_event_send_stop(&error_abort);
1007         }
1008     }
1009
1010     bdrv_drain_all();
1011     replay_disable_events();
1012     ret = bdrv_flush_all();
1013
1014     return ret;
1015 }
1016
/* Special vm_stop() variant for terminating the process.  Historically
 * clients did not expect a QMP STOP event, so none is sent here in order
 * to retain compatibility.
 *
 * Moves to RUN_STATE_SHUTDOWN and returns the result of do_vm_stop().
 */
int vm_shutdown(void)
{
    return do_vm_stop(RUN_STATE_SHUTDOWN, false);
}
1024
1025 static bool cpu_can_run(CPUState *cpu)
1026 {
1027     if (cpu->stop) {
1028         return false;
1029     }
1030     if (cpu_is_stopped(cpu)) {
1031         return false;
1032     }
1033     return true;
1034 }
1035
/* Handle an EXCP_DEBUG exit from @cpu: tell the gdbstub which CPU stopped,
 * raise a system debug request and mark the vCPU as stopped.
 */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1042
1043 #ifdef CONFIG_LINUX
1044 static void sigbus_reraise(void)
1045 {
1046     sigset_t set;
1047     struct sigaction action;
1048
1049     memset(&action, 0, sizeof(action));
1050     action.sa_handler = SIG_DFL;
1051     if (!sigaction(SIGBUS, &action, NULL)) {
1052         raise(SIGBUS);
1053         sigemptyset(&set);
1054         sigaddset(&set, SIGBUS);
1055         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1056     }
1057     perror("Failed to re-raise SIGBUS!\n");
1058     abort();
1059 }
1060
1061 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1062 {
1063     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1064         sigbus_reraise();
1065     }
1066
1067     if (current_cpu) {
1068         /* Called asynchronously in VCPU thread.  */
1069         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1070             sigbus_reraise();
1071         }
1072     } else {
1073         /* Called synchronously (via signalfd) in main thread.  */
1074         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1075             sigbus_reraise();
1076         }
1077     }
1078 }
1079
1080 static void qemu_init_sigbus(void)
1081 {
1082     struct sigaction action;
1083
1084     memset(&action, 0, sizeof(action));
1085     action.sa_flags = SA_SIGINFO;
1086     action.sa_sigaction = sigbus_handler;
1087     sigaction(SIGBUS, &action, NULL);
1088
1089     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1090 }
1091 #else /* !CONFIG_LINUX */
/* Non-Linux build: no SIGBUS handling is set up. */
static void qemu_init_sigbus(void)
{
}
1095 #endif /* !CONFIG_LINUX */
1096
/* The mutex taken by qemu_mutex_lock_iothread(). */
static QemuMutex qemu_global_mutex;

/* Thread that called qemu_init_cpu_loop(). */
static QemuThread io_thread;

/* Signalled by the vCPU threads when cpu->created changes;
 * qemu_init_vcpu() waits on it.
 */
static QemuCond qemu_cpu_cond;
/* Broadcast by qemu_cpu_stop(); pause_all_vcpus() waits on it. */
static QemuCond qemu_pause_cond;
1105
1106 void qemu_init_cpu_loop(void)
1107 {
1108     qemu_init_sigbus();
1109     qemu_cond_init(&qemu_cpu_cond);
1110     qemu_cond_init(&qemu_pause_cond);
1111     qemu_mutex_init(&qemu_global_mutex);
1112
1113     qemu_thread_get_self(&io_thread);
1114 }
1115
/* Run @func with @data on @cpu by delegating to do_run_on_cpu(), passing
 * qemu_global_mutex as the mutex argument.
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1120
1121 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1122 {
1123     if (kvm_destroy_vcpu(cpu) < 0) {
1124         error_report("kvm_destroy_vcpu failed");
1125         exit(EXIT_FAILURE);
1126     }
1127 }
1128
/* TCG counterpart of qemu_kvm_destroy_vcpu(); currently does nothing. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1132
/* Mark @cpu as stopped.  Must be called from @cpu's own thread (asserted).
 *
 * Clears the pending stop request, sets cpu->stopped, calls cpu_exit()
 * when @exit is true and finally broadcasts qemu_pause_cond, which
 * pause_all_vcpus() waits on.
 */
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}
1143
/* Common tail of the wait helpers: clear @cpu's thread_kicked flag, honour
 * a pending stop request and run the work items queued for @cpu.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
1152
/* Wait helper of the round-robin TCG thread.
 *
 * While all_cpu_threads_idle() holds, the kick timer is stopped and the
 * thread sleeps on @cpu's halt_cond (qemu_cond_wait() is handed
 * qemu_global_mutex).  Afterwards the kick timer is started again and the
 * common wake-up processing runs for @cpu.
 */
static void qemu_tcg_rr_wait_io_event(CPUState *cpu)
{
    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    qemu_wait_io_event_common(cpu);
}
1164
/* Wait helper of the per-vCPU threads.
 *
 * Sleeps on @cpu's halt_cond (qemu_cond_wait() is handed
 * qemu_global_mutex) for as long as cpu_thread_is_idle() holds, then runs
 * the common wake-up processing for @cpu.
 */
static void qemu_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

#ifdef _WIN32
    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
    if (!tcg_enabled()) {
        SleepEx(0, TRUE);
    }
#endif
    qemu_wait_io_event_common(cpu);
}
1179
1180 static void *qemu_kvm_cpu_thread_fn(void *arg)
1181 {
1182     CPUState *cpu = arg;
1183     int r;
1184
1185     rcu_register_thread();
1186
1187     qemu_mutex_lock_iothread();
1188     qemu_thread_get_self(cpu->thread);
1189     cpu->thread_id = qemu_get_thread_id();
1190     cpu->can_do_io = 1;
1191     current_cpu = cpu;
1192
1193     r = kvm_init_vcpu(cpu);
1194     if (r < 0) {
1195         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1196         exit(1);
1197     }
1198
1199     kvm_init_cpu_signals(cpu);
1200
1201     /* signal CPU creation */
1202     cpu->created = true;
1203     qemu_cond_signal(&qemu_cpu_cond);
1204
1205     do {
1206         if (cpu_can_run(cpu)) {
1207             r = kvm_cpu_exec(cpu);
1208             if (r == EXCP_DEBUG) {
1209                 cpu_handle_guest_debug(cpu);
1210             }
1211         }
1212         qemu_wait_io_event(cpu);
1213     } while (!cpu->unplug || cpu_can_run(cpu));
1214
1215     qemu_kvm_destroy_vcpu(cpu);
1216     cpu->created = false;
1217     qemu_cond_signal(&qemu_cpu_cond);
1218     qemu_mutex_unlock_iothread();
1219     rcu_unregister_thread();
1220     return NULL;
1221 }
1222
1223 static void *qemu_dummy_cpu_thread_fn(void *arg)
1224 {
1225 #ifdef _WIN32
1226     error_report("qtest is not supported under Windows");
1227     exit(1);
1228 #else
1229     CPUState *cpu = arg;
1230     sigset_t waitset;
1231     int r;
1232
1233     rcu_register_thread();
1234
1235     qemu_mutex_lock_iothread();
1236     qemu_thread_get_self(cpu->thread);
1237     cpu->thread_id = qemu_get_thread_id();
1238     cpu->can_do_io = 1;
1239     current_cpu = cpu;
1240
1241     sigemptyset(&waitset);
1242     sigaddset(&waitset, SIG_IPI);
1243
1244     /* signal CPU creation */
1245     cpu->created = true;
1246     qemu_cond_signal(&qemu_cpu_cond);
1247
1248     do {
1249         qemu_mutex_unlock_iothread();
1250         do {
1251             int sig;
1252             r = sigwait(&waitset, &sig);
1253         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1254         if (r == -1) {
1255             perror("sigwait");
1256             exit(1);
1257         }
1258         qemu_mutex_lock_iothread();
1259         qemu_wait_io_event(cpu);
1260     } while (!cpu->unplug);
1261
1262     rcu_unregister_thread();
1263     return NULL;
1264 #endif
1265 }
1266
1267 static int64_t tcg_get_icount_limit(void)
1268 {
1269     int64_t deadline;
1270
1271     if (replay_mode != REPLAY_MODE_PLAY) {
1272         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1273
1274         /* Maintain prior (possibly buggy) behaviour where if no deadline
1275          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1276          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1277          * nanoseconds.
1278          */
1279         if ((deadline < 0) || (deadline > INT32_MAX)) {
1280             deadline = INT32_MAX;
1281         }
1282
1283         return qemu_icount_round(deadline);
1284     } else {
1285         return replay_get_instructions();
1286     }
1287 }
1288
1289 static void handle_icount_deadline(void)
1290 {
1291     assert(qemu_in_vcpu_thread());
1292     if (use_icount) {
1293         int64_t deadline =
1294             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1295
1296         if (deadline == 0) {
1297             /* Wake up other AioContexts.  */
1298             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1299             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1300         }
1301     }
1302 }
1303
1304 static void prepare_icount_for_run(CPUState *cpu)
1305 {
1306     if (use_icount) {
1307         int insns_left;
1308
1309         /* These should always be cleared by process_icount_data after
1310          * each vCPU execution. However u16.high can be raised
1311          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1312          */
1313         g_assert(cpu->icount_decr.u16.low == 0);
1314         g_assert(cpu->icount_extra == 0);
1315
1316         cpu->icount_budget = tcg_get_icount_limit();
1317         insns_left = MIN(0xffff, cpu->icount_budget);
1318         cpu->icount_decr.u16.low = insns_left;
1319         cpu->icount_extra = cpu->icount_budget - insns_left;
1320
1321         replay_mutex_lock();
1322     }
1323 }
1324
1325 static void process_icount_data(CPUState *cpu)
1326 {
1327     if (use_icount) {
1328         /* Account for executed instructions */
1329         cpu_update_icount(cpu);
1330
1331         /* Reset the counters */
1332         cpu->icount_decr.u16.low = 0;
1333         cpu->icount_extra = 0;
1334         cpu->icount_budget = 0;
1335
1336         replay_account_executed_instructions();
1337
1338         replay_mutex_unlock();
1339     }
1340 }
1341
1342
1343 static int tcg_cpu_exec(CPUState *cpu)
1344 {
1345     int ret;
1346 #ifdef CONFIG_PROFILER
1347     int64_t ti;
1348 #endif
1349
1350 #ifdef CONFIG_PROFILER
1351     ti = profile_getclock();
1352 #endif
1353     cpu_exec_start(cpu);
1354     ret = cpu_exec(cpu);
1355     cpu_exec_end(cpu);
1356 #ifdef CONFIG_PROFILER
1357     tcg_time += profile_getclock() - ti;
1358 #endif
1359     return ret;
1360 }
1361
1362 /* Destroy any remaining vCPUs which have been unplugged and have
1363  * finished running
1364  */
1365 static void deal_with_unplugged_cpus(void)
1366 {
1367     CPUState *cpu;
1368
1369     CPU_FOREACH(cpu) {
1370         if (cpu->unplug && !cpu_can_run(cpu)) {
1371             qemu_tcg_destroy_vcpu(cpu);
1372             cpu->created = false;
1373             qemu_cond_signal(&qemu_cpu_cond);
1374             break;
1375         }
1376     }
1377 }
1378
1379 /* Single-threaded TCG
1380  *
1381  * In the single-threaded case each vCPU is simulated in turn. If
1382  * there is more than a single vCPU we create a simple timer to kick
1383  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1384  * This is done explicitly rather than relying on side-effects
1385  * elsewhere.
1386  */
1387
/* Thread body of the single thread that runs every vCPU in round-robin
 * TCG mode.
 *
 * @arg is the CPUState the thread was created for; the local variable cpu
 * is reused afterwards as the cursor over the CPU list.  The iothread lock
 * is taken at the top and dropped only around guest execution, around the
 * acquisition of the replay mutex and inside the condition waits.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    /* Wake qemu_init_vcpu(), which waits for cpu->created. */
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /* The iothread lock is dropped before the replay mutex is taken
         * and re-acquired afterwards.
         */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* Wrap around to the head of the list once the end was reached. */
        if (!cpu) {
            cpu = first_cpu;
        }

        /* Run vCPUs in turn until the list ends, the current vCPU has
         * queued work or an exit was requested for it.
         */
        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code executes without the iothread lock. */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* An unplugged vCPU is skipped before leaving the loop. */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        /* Wait on the current vCPU, or on the head of the list when the
         * cursor ran off the end.
         */
        qemu_tcg_rr_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
        deal_with_unplugged_cpus();
    }

    /* NOTE(review): the outer while (1) has no visible exit path, so the
     * two statements below appear to be unreachable.
     */
    rcu_unregister_thread();
    return NULL;
}
1491
1492 static void *qemu_hax_cpu_thread_fn(void *arg)
1493 {
1494     CPUState *cpu = arg;
1495     int r;
1496
1497     rcu_register_thread();
1498     qemu_mutex_lock_iothread();
1499     qemu_thread_get_self(cpu->thread);
1500
1501     cpu->thread_id = qemu_get_thread_id();
1502     cpu->created = true;
1503     cpu->halted = 0;
1504     current_cpu = cpu;
1505
1506     hax_init_vcpu(cpu);
1507     qemu_cond_signal(&qemu_cpu_cond);
1508
1509     do {
1510         if (cpu_can_run(cpu)) {
1511             r = hax_smp_cpu_exec(cpu);
1512             if (r == EXCP_DEBUG) {
1513                 cpu_handle_guest_debug(cpu);
1514             }
1515         }
1516
1517         qemu_wait_io_event(cpu);
1518     } while (!cpu->unplug || cpu_can_run(cpu));
1519     rcu_unregister_thread();
1520     return NULL;
1521 }
1522
1523 /* The HVF-specific vCPU thread function. This one should only run when the host
1524  * CPU supports the VMX "unrestricted guest" feature. */
1525 static void *qemu_hvf_cpu_thread_fn(void *arg)
1526 {
1527     CPUState *cpu = arg;
1528
1529     int r;
1530
1531     assert(hvf_enabled());
1532
1533     rcu_register_thread();
1534
1535     qemu_mutex_lock_iothread();
1536     qemu_thread_get_self(cpu->thread);
1537
1538     cpu->thread_id = qemu_get_thread_id();
1539     cpu->can_do_io = 1;
1540     current_cpu = cpu;
1541
1542     hvf_init_vcpu(cpu);
1543
1544     /* signal CPU creation */
1545     cpu->created = true;
1546     qemu_cond_signal(&qemu_cpu_cond);
1547
1548     do {
1549         if (cpu_can_run(cpu)) {
1550             r = hvf_vcpu_exec(cpu);
1551             if (r == EXCP_DEBUG) {
1552                 cpu_handle_guest_debug(cpu);
1553             }
1554         }
1555         qemu_wait_io_event(cpu);
1556     } while (!cpu->unplug || cpu_can_run(cpu));
1557
1558     hvf_vcpu_destroy(cpu);
1559     cpu->created = false;
1560     qemu_cond_signal(&qemu_cpu_cond);
1561     qemu_mutex_unlock_iothread();
1562     rcu_unregister_thread();
1563     return NULL;
1564 }
1565
1566 static void *qemu_whpx_cpu_thread_fn(void *arg)
1567 {
1568     CPUState *cpu = arg;
1569     int r;
1570
1571     rcu_register_thread();
1572
1573     qemu_mutex_lock_iothread();
1574     qemu_thread_get_self(cpu->thread);
1575     cpu->thread_id = qemu_get_thread_id();
1576     current_cpu = cpu;
1577
1578     r = whpx_init_vcpu(cpu);
1579     if (r < 0) {
1580         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1581         exit(1);
1582     }
1583
1584     /* signal CPU creation */
1585     cpu->created = true;
1586     qemu_cond_signal(&qemu_cpu_cond);
1587
1588     do {
1589         if (cpu_can_run(cpu)) {
1590             r = whpx_vcpu_exec(cpu);
1591             if (r == EXCP_DEBUG) {
1592                 cpu_handle_guest_debug(cpu);
1593             }
1594         }
1595         while (cpu_thread_is_idle(cpu)) {
1596             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1597         }
1598         qemu_wait_io_event_common(cpu);
1599     } while (!cpu->unplug || cpu_can_run(cpu));
1600
1601     whpx_destroy_vcpu(cpu);
1602     cpu->created = false;
1603     qemu_cond_signal(&qemu_cpu_cond);
1604     qemu_mutex_unlock_iothread();
1605     rcu_unregister_thread();
1606     return NULL;
1607 }
1608
1609 #ifdef _WIN32
/* Empty APC routine; qemu_cpu_kick_thread() queues it on a vCPU thread
 * with QueueUserAPC().
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1613 #endif
1614
1615 /* Multi-threaded TCG
1616  *
1617  * In the multi-threaded case each vCPU has its own thread. The TLS
1618  * variable current_cpu can be used deep in the code to find the
1619  * current CPUState for a given thread.
1620  */
1621
1622 static void *qemu_tcg_cpu_thread_fn(void *arg)
1623 {
1624     CPUState *cpu = arg;
1625
1626     g_assert(!use_icount);
1627
1628     rcu_register_thread();
1629     tcg_register_thread();
1630
1631     qemu_mutex_lock_iothread();
1632     qemu_thread_get_self(cpu->thread);
1633
1634     cpu->thread_id = qemu_get_thread_id();
1635     cpu->created = true;
1636     cpu->can_do_io = 1;
1637     current_cpu = cpu;
1638     qemu_cond_signal(&qemu_cpu_cond);
1639
1640     /* process any pending work */
1641     cpu->exit_request = 1;
1642
1643     while (1) {
1644         if (cpu_can_run(cpu)) {
1645             int r;
1646             qemu_mutex_unlock_iothread();
1647             r = tcg_cpu_exec(cpu);
1648             qemu_mutex_lock_iothread();
1649             switch (r) {
1650             case EXCP_DEBUG:
1651                 cpu_handle_guest_debug(cpu);
1652                 break;
1653             case EXCP_HALTED:
1654                 /* during start-up the vCPU is reset and the thread is
1655                  * kicked several times. If we don't ensure we go back
1656                  * to sleep in the halted state we won't cleanly
1657                  * start-up when the vCPU is enabled.
1658                  *
1659                  * cpu->halted should ensure we sleep in wait_io_event
1660                  */
1661                 g_assert(cpu->halted);
1662                 break;
1663             case EXCP_ATOMIC:
1664                 qemu_mutex_unlock_iothread();
1665                 cpu_exec_step_atomic(cpu);
1666                 qemu_mutex_lock_iothread();
1667             default:
1668                 /* Ignore everything else? */
1669                 break;
1670             }
1671         }
1672
1673         atomic_mb_set(&cpu->exit_request, 0);
1674         qemu_wait_io_event(cpu);
1675     } while (!cpu->unplug || cpu_can_run(cpu));
1676
1677     qemu_tcg_destroy_vcpu(cpu);
1678     cpu->created = false;
1679     qemu_cond_signal(&qemu_cpu_cond);
1680     qemu_mutex_unlock_iothread();
1681     rcu_unregister_thread();
1682     return NULL;
1683 }
1684
1685 static void qemu_cpu_kick_thread(CPUState *cpu)
1686 {
1687 #ifndef _WIN32
1688     int err;
1689
1690     if (cpu->thread_kicked) {
1691         return;
1692     }
1693     cpu->thread_kicked = true;
1694     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1695     if (err) {
1696         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1697         exit(1);
1698     }
1699 #else /* _WIN32 */
1700     if (!qemu_cpu_is_self(cpu)) {
1701         if (whpx_enabled()) {
1702             whpx_vcpu_kick(cpu);
1703         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1704             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1705                     __func__, GetLastError());
1706             exit(1);
1707         }
1708     }
1709 #endif
1710 }
1711
1712 void qemu_cpu_kick(CPUState *cpu)
1713 {
1714     qemu_cond_broadcast(cpu->halt_cond);
1715     if (tcg_enabled()) {
1716         cpu_exit(cpu);
1717         /* NOP unless doing single-thread RR */
1718         qemu_cpu_kick_rr_cpu();
1719     } else {
1720         if (hax_enabled()) {
1721             /*
1722              * FIXME: race condition with the exit_request check in
1723              * hax_vcpu_hax_exec
1724              */
1725             cpu->exit_request = 1;
1726         }
1727         qemu_cpu_kick_thread(cpu);
1728     }
1729 }
1730
/* Kick the host thread of the calling vCPU.  current_cpu must be set
 * (asserted).
 */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1736
/* Return true when the calling thread is the thread recorded in
 * @cpu->thread.
 */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1741
1742 bool qemu_in_vcpu_thread(void)
1743 {
1744     return current_cpu && qemu_cpu_is_self(current_cpu);
1745 }
1746
/* Per-thread flag: true while this thread holds the iothread lock taken
 * through qemu_mutex_lock_iothread().
 */
static __thread bool iothread_locked = false;

/* Return whether the calling thread currently holds the iothread lock. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1753
/* Take qemu_global_mutex and record in the per-thread flag that this
 * thread holds it.  Asserts that the calling thread does not hold it yet.
 */
void qemu_mutex_lock_iothread(void)
{
    g_assert(!qemu_mutex_iothread_locked());
    qemu_mutex_lock(&qemu_global_mutex);
    iothread_locked = true;
}
1760
/* Release qemu_global_mutex.  Asserts that the calling thread holds it;
 * the per-thread flag is cleared before the mutex is released.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1767
1768 static bool all_vcpus_paused(void)
1769 {
1770     CPUState *cpu;
1771
1772     CPU_FOREACH(cpu) {
1773         if (!cpu->stopped) {
1774             return false;
1775         }
1776     }
1777
1778     return true;
1779 }
1780
/* Stop every vCPU and wait until all of them report being stopped.
 *
 * QEMU_CLOCK_VIRTUAL is disabled first.  A vCPU whose thread is the caller
 * is stopped directly; every other vCPU gets its stop flag set and is
 * kicked.  The replay mutex is released while waiting and re-acquired
 * before returning.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    /* Sleep on qemu_pause_cond until all vCPUs are stopped; every vCPU is
     * kicked again after each wake-up.
     */
    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /* The iothread lock is dropped before the replay mutex is taken and
     * re-acquired afterwards -- presumably to keep the "replay mutex
     * first" lock order; TODO confirm.
     */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1811
/* Let @cpu run again: clear its stop request and stopped flag, then kick
 * it.
 */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1818
1819 void resume_all_vcpus(void)
1820 {
1821     CPUState *cpu;
1822
1823     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1824     CPU_FOREACH(cpu) {
1825         cpu_resume(cpu);
1826     }
1827 }
1828
/* Request removal of @cpu and wait for its thread to finish.
 *
 * Sets the stop and unplug flags, kicks the vCPU and joins cpu->thread.
 * The iothread lock is dropped for the duration of the join and re-taken
 * before returning.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1838
/* Size, in bytes, of the temporary buffers in which the vCPU thread names
 * are formatted with snprintf().
 */
#define VCPU_THREAD_NAME_SIZE 16
1841
/* Set up the host thread for a TCG vCPU.
 *
 * With MTTCG every vCPU gets a thread and halt condition of its own.
 * Without MTTCG the first call creates the single round-robin thread;
 * later calls make @cpu share that thread and its halt condition and mark
 * the vCPU as created right away.
 */
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    /* Thread and halt condition shared by all vCPUs without MTTCG; set by
     * the first call that creates the round-robin thread.
     */
    static QemuCond *single_tcg_halt_cond;
    static QemuThread *single_tcg_cpu_thread;
    static int tcg_region_inited;

    /*
     * Initialize TCG regions--once. Now is a good time, because:
     * (1) TCG's init context, prologue and target globals have been set up.
     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
     *     -accel flag is processed, so the check doesn't work then).
     */
    if (!tcg_region_inited) {
        tcg_region_inited = 1;
        tcg_region_init();
    }

    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);

        if (qemu_tcg_mttcg_enabled()) {
            /* create a thread per vCPU with TCG (MTTCG) */
            parallel_cpus = true;
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);

            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

        } else {
            /* share a single thread for all cpus with TCG */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
            qemu_thread_create(cpu->thread, thread_name,
                               qemu_tcg_rr_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

            /* Remember them so that later vCPUs can share them. */
            single_tcg_halt_cond = cpu->halt_cond;
            single_tcg_cpu_thread = cpu->thread;
        }
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    } else {
        /* For non-MTTCG cases we share the thread */
        cpu->thread = single_tcg_cpu_thread;
        cpu->halt_cond = single_tcg_halt_cond;
        cpu->thread_id = first_cpu->thread_id;
        cpu->can_do_io = 1;
        /* No new thread is started here, so mark the vCPU created now. */
        cpu->created = true;
    }
}
1896
1897 static void qemu_hax_start_vcpu(CPUState *cpu)
1898 {
1899     char thread_name[VCPU_THREAD_NAME_SIZE];
1900
1901     cpu->thread = g_malloc0(sizeof(QemuThread));
1902     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1903     qemu_cond_init(cpu->halt_cond);
1904
1905     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1906              cpu->cpu_index);
1907     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1908                        cpu, QEMU_THREAD_JOINABLE);
1909 #ifdef _WIN32
1910     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1911 #endif
1912 }
1913
1914 static void qemu_kvm_start_vcpu(CPUState *cpu)
1915 {
1916     char thread_name[VCPU_THREAD_NAME_SIZE];
1917
1918     cpu->thread = g_malloc0(sizeof(QemuThread));
1919     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1920     qemu_cond_init(cpu->halt_cond);
1921     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1922              cpu->cpu_index);
1923     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1924                        cpu, QEMU_THREAD_JOINABLE);
1925 }
1926
1927 static void qemu_hvf_start_vcpu(CPUState *cpu)
1928 {
1929     char thread_name[VCPU_THREAD_NAME_SIZE];
1930
1931     /* HVF currently does not support TCG, and only runs in
1932      * unrestricted-guest mode. */
1933     assert(hvf_enabled());
1934
1935     cpu->thread = g_malloc0(sizeof(QemuThread));
1936     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1937     qemu_cond_init(cpu->halt_cond);
1938
1939     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
1940              cpu->cpu_index);
1941     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
1942                        cpu, QEMU_THREAD_JOINABLE);
1943 }
1944
1945 static void qemu_whpx_start_vcpu(CPUState *cpu)
1946 {
1947     char thread_name[VCPU_THREAD_NAME_SIZE];
1948
1949     cpu->thread = g_malloc0(sizeof(QemuThread));
1950     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1951     qemu_cond_init(cpu->halt_cond);
1952     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
1953              cpu->cpu_index);
1954     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
1955                        cpu, QEMU_THREAD_JOINABLE);
1956 #ifdef _WIN32
1957     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1958 #endif
1959 }
1960
1961 static void qemu_dummy_start_vcpu(CPUState *cpu)
1962 {
1963     char thread_name[VCPU_THREAD_NAME_SIZE];
1964
1965     cpu->thread = g_malloc0(sizeof(QemuThread));
1966     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1967     qemu_cond_init(cpu->halt_cond);
1968     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1969              cpu->cpu_index);
1970     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1971                        QEMU_THREAD_JOINABLE);
1972 }
1973
1974 void qemu_init_vcpu(CPUState *cpu)
1975 {
1976     cpu->nr_cores = smp_cores;
1977     cpu->nr_threads = smp_threads;
1978     cpu->stopped = true;
1979
1980     if (!cpu->as) {
1981         /* If the target cpu hasn't set up any address spaces itself,
1982          * give it the default one.
1983          */
1984         cpu->num_ases = 1;
1985         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
1986     }
1987
1988     if (kvm_enabled()) {
1989         qemu_kvm_start_vcpu(cpu);
1990     } else if (hax_enabled()) {
1991         qemu_hax_start_vcpu(cpu);
1992     } else if (hvf_enabled()) {
1993         qemu_hvf_start_vcpu(cpu);
1994     } else if (tcg_enabled()) {
1995         qemu_tcg_init_vcpu(cpu);
1996     } else if (whpx_enabled()) {
1997         qemu_whpx_start_vcpu(cpu);
1998     } else {
1999         qemu_dummy_start_vcpu(cpu);
2000     }
2001
2002     while (!cpu->created) {
2003         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2004     }
2005 }
2006
2007 void cpu_stop_current(void)
2008 {
2009     if (current_cpu) {
2010         qemu_cpu_stop(current_cpu, true);
2011     }
2012 }
2013
2014 int vm_stop(RunState state)
2015 {
2016     if (qemu_in_vcpu_thread()) {
2017         qemu_system_vmstop_request_prepare();
2018         qemu_system_vmstop_request(state);
2019         /*
2020          * FIXME: should not return to device code in case
2021          * vm_stop() has been requested.
2022          */
2023         cpu_stop_current();
2024         return 0;
2025     }
2026
2027     return do_vm_stop(state, true);
2028 }
2029
2030 /**
2031  * Prepare for (re)starting the VM.
2032  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2033  * running or in case of an error condition), 0 otherwise.
2034  */
2035 int vm_prepare_start(void)
2036 {
2037     RunState requested;
2038     int res = 0;
2039
2040     qemu_vmstop_requested(&requested);
2041     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2042         return -1;
2043     }
2044
2045     /* Ensure that a STOP/RESUME pair of events is emitted if a
2046      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2047      * example, according to documentation is always followed by
2048      * the STOP event.
2049      */
2050     if (runstate_is_running()) {
2051         qapi_event_send_stop(&error_abort);
2052         res = -1;
2053     } else {
2054         replay_enable_events();
2055         cpu_enable_ticks();
2056         runstate_set(RUN_STATE_RUNNING);
2057         vm_state_notify(1, RUN_STATE_RUNNING);
2058     }
2059
2060     /* We are sending this now, but the CPUs will be resumed shortly later */
2061     qapi_event_send_resume(&error_abort);
2062     return res;
2063 }
2064
/*
 * Start (or resume) the VM: run vm_prepare_start() and, only when it
 * returns 0, resume all vCPUs.
 */
void vm_start(void)
{
    if (vm_prepare_start() != 0) {
        return;
    }

    resume_all_vcpus();
}
2071
2072 /* does a state transition even if the VM is already stopped,
2073    current state is forgotten forever */
2074 int vm_stop_force_state(RunState state)
2075 {
2076     if (runstate_is_running()) {
2077         return vm_stop(state);
2078     } else {
2079         runstate_set(state);
2080
2081         bdrv_drain_all();
2082         /* Make sure to return an error if the flush in a previous vm_stop()
2083          * failed. */
2084         return bdrv_flush_all();
2085     }
2086 }
2087
2088 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
2089 {
2090     /* XXX: implement xxx_cpu_list for targets that still miss it */
2091 #if defined(cpu_list)
2092     cpu_list(f, cpu_fprintf);
2093 #endif
2094 }
2095
2096 CpuInfoList *qmp_query_cpus(Error **errp)
2097 {
2098     MachineState *ms = MACHINE(qdev_get_machine());
2099     MachineClass *mc = MACHINE_GET_CLASS(ms);
2100     CpuInfoList *head = NULL, *cur_item = NULL;
2101     CPUState *cpu;
2102
2103     CPU_FOREACH(cpu) {
2104         CpuInfoList *info;
2105 #if defined(TARGET_I386)
2106         X86CPU *x86_cpu = X86_CPU(cpu);
2107         CPUX86State *env = &x86_cpu->env;
2108 #elif defined(TARGET_PPC)
2109         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2110         CPUPPCState *env = &ppc_cpu->env;
2111 #elif defined(TARGET_SPARC)
2112         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2113         CPUSPARCState *env = &sparc_cpu->env;
2114 #elif defined(TARGET_RISCV)
2115         RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2116         CPURISCVState *env = &riscv_cpu->env;
2117 #elif defined(TARGET_MIPS)
2118         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2119         CPUMIPSState *env = &mips_cpu->env;
2120 #elif defined(TARGET_TRICORE)
2121         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2122         CPUTriCoreState *env = &tricore_cpu->env;
2123 #elif defined(TARGET_S390X)
2124         S390CPU *s390_cpu = S390_CPU(cpu);
2125         CPUS390XState *env = &s390_cpu->env;
2126 #endif
2127
2128         cpu_synchronize_state(cpu);
2129
2130         info = g_malloc0(sizeof(*info));
2131         info->value = g_malloc0(sizeof(*info->value));
2132         info->value->CPU = cpu->cpu_index;
2133         info->value->current = (cpu == first_cpu);
2134         info->value->halted = cpu->halted;
2135         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2136         info->value->thread_id = cpu->thread_id;
2137 #if defined(TARGET_I386)
2138         info->value->arch = CPU_INFO_ARCH_X86;
2139         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2140 #elif defined(TARGET_PPC)
2141         info->value->arch = CPU_INFO_ARCH_PPC;
2142         info->value->u.ppc.nip = env->nip;
2143 #elif defined(TARGET_SPARC)
2144         info->value->arch = CPU_INFO_ARCH_SPARC;
2145         info->value->u.q_sparc.pc = env->pc;
2146         info->value->u.q_sparc.npc = env->npc;
2147 #elif defined(TARGET_MIPS)
2148         info->value->arch = CPU_INFO_ARCH_MIPS;
2149         info->value->u.q_mips.PC = env->active_tc.PC;
2150 #elif defined(TARGET_TRICORE)
2151         info->value->arch = CPU_INFO_ARCH_TRICORE;
2152         info->value->u.tricore.PC = env->PC;
2153 #elif defined(TARGET_S390X)
2154         info->value->arch = CPU_INFO_ARCH_S390;
2155         info->value->u.s390.cpu_state = env->cpu_state;
2156 #elif defined(TARGET_RISCV)
2157         info->value->arch = CPU_INFO_ARCH_RISCV;
2158         info->value->u.riscv.pc = env->pc;
2159 #else
2160         info->value->arch = CPU_INFO_ARCH_OTHER;
2161 #endif
2162         info->value->has_props = !!mc->cpu_index_to_instance_props;
2163         if (info->value->has_props) {
2164             CpuInstanceProperties *props;
2165             props = g_malloc0(sizeof(*props));
2166             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2167             info->value->props = props;
2168         }
2169
2170         /* XXX: waiting for the qapi to support GSList */
2171         if (!cur_item) {
2172             head = cur_item = info;
2173         } else {
2174             cur_item->next = info;
2175             cur_item = info;
2176         }
2177     }
2178
2179     return head;
2180 }
2181
2182 /*
2183  * fast means: we NEVER interrupt vCPU threads to retrieve
2184  * information from KVM.
2185  */
2186 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2187 {
2188     MachineState *ms = MACHINE(qdev_get_machine());
2189     MachineClass *mc = MACHINE_GET_CLASS(ms);
2190     CpuInfoFastList *head = NULL, *cur_item = NULL;
2191     CPUState *cpu;
2192 #if defined(TARGET_S390X)
2193     S390CPU *s390_cpu;
2194     CPUS390XState *env;
2195 #endif
2196
2197     CPU_FOREACH(cpu) {
2198         CpuInfoFastList *info = g_malloc0(sizeof(*info));
2199         info->value = g_malloc0(sizeof(*info->value));
2200
2201         info->value->cpu_index = cpu->cpu_index;
2202         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2203         info->value->thread_id = cpu->thread_id;
2204
2205         info->value->has_props = !!mc->cpu_index_to_instance_props;
2206         if (info->value->has_props) {
2207             CpuInstanceProperties *props;
2208             props = g_malloc0(sizeof(*props));
2209             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2210             info->value->props = props;
2211         }
2212
2213 #if defined(TARGET_S390X)
2214         s390_cpu = S390_CPU(cpu);
2215         env = &s390_cpu->env;
2216         info->value->arch = CPU_INFO_ARCH_S390;
2217         info->value->u.s390.cpu_state = env->cpu_state;
2218 #endif
2219         if (!cur_item) {
2220             head = cur_item = info;
2221         } else {
2222             cur_item->next = info;
2223             cur_item = info;
2224         }
2225     }
2226
2227     return head;
2228 }
2229
2230 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2231                  bool has_cpu, int64_t cpu_index, Error **errp)
2232 {
2233     FILE *f;
2234     uint32_t l;
2235     CPUState *cpu;
2236     uint8_t buf[1024];
2237     int64_t orig_addr = addr, orig_size = size;
2238
2239     if (!has_cpu) {
2240         cpu_index = 0;
2241     }
2242
2243     cpu = qemu_get_cpu(cpu_index);
2244     if (cpu == NULL) {
2245         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2246                    "a CPU number");
2247         return;
2248     }
2249
2250     f = fopen(filename, "wb");
2251     if (!f) {
2252         error_setg_file_open(errp, errno, filename);
2253         return;
2254     }
2255
2256     while (size != 0) {
2257         l = sizeof(buf);
2258         if (l > size)
2259             l = size;
2260         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2261             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2262                              " specified", orig_addr, orig_size);
2263             goto exit;
2264         }
2265         if (fwrite(buf, 1, l, f) != l) {
2266             error_setg(errp, QERR_IO_ERROR);
2267             goto exit;
2268         }
2269         addr += l;
2270         size -= l;
2271     }
2272
2273 exit:
2274     fclose(f);
2275 }
2276
2277 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2278                   Error **errp)
2279 {
2280     FILE *f;
2281     uint32_t l;
2282     uint8_t buf[1024];
2283
2284     f = fopen(filename, "wb");
2285     if (!f) {
2286         error_setg_file_open(errp, errno, filename);
2287         return;
2288     }
2289
2290     while (size != 0) {
2291         l = sizeof(buf);
2292         if (l > size)
2293             l = size;
2294         cpu_physical_memory_read(addr, buf, l);
2295         if (fwrite(buf, 1, l, f) != l) {
2296             error_setg(errp, QERR_IO_ERROR);
2297             goto exit;
2298         }
2299         addr += l;
2300         size -= l;
2301     }
2302
2303 exit:
2304     fclose(f);
2305 }
2306
2307 void qmp_inject_nmi(Error **errp)
2308 {
2309     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2310 }
2311
2312 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2313 {
2314     if (!use_icount) {
2315         return;
2316     }
2317
2318     cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
2319                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2320     if (icount_align_option) {
2321         cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
2322         cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
2323     } else {
2324         cpu_fprintf(f, "Max guest delay     NA\n");
2325         cpu_fprintf(f, "Max guest advance   NA\n");
2326     }
2327 }