cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu/config-file.h"
  27 #include "cpu.h"
  28 #include "monitor/monitor.h"
  29 #include "qapi/error.h"
  30 #include "qapi/qapi-commands-misc.h"
  31 #include "qapi/qapi-events-run-state.h"
  32 #include "qapi/qmp/qerror.h"
  33 #include "qemu/error-report.h"
  34 #include "sysemu/sysemu.h"
  35 #include "sysemu/block-backend.h"
  36 #include "exec/gdbstub.h"
  37 #include "sysemu/dma.h"
  38 #include "sysemu/hw_accel.h"
  39 #include "sysemu/kvm.h"
  40 #include "sysemu/hax.h"
  41 #include "sysemu/hvf.h"
  42 #include "sysemu/whpx.h"
  43 #include "exec/exec-all.h"
  44
  45 #include "qemu/thread.h"
  46 #include "sysemu/cpus.h"
  47 #include "sysemu/qtest.h"
  48 #include "qemu/main-loop.h"
  49 #include "qemu/option.h"
  50 #include "qemu/bitmap.h"
  51 #include "qemu/seqlock.h"
  52 #include "tcg.h"
  53 #include "hw/nmi.h"
  54 #include "sysemu/replay.h"
  55 #include "hw/boards.h"
  56
#ifdef CONFIG_LINUX

#include <sys/prctl.h>

/*
 * Fallback values for the PR_MCE_KILL* prctl constants, used only when
 * <sys/prctl.h> does not already define them.
 */
#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */
  74
/* NOTE(review): not referenced in this part of the file; presumably
 * icount alignment statistics -- confirm against their users.
 */
int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
/* Timer that runs cpu_throttle_timer_tick(); created in cpu_ticks_init(). */
static QEMUTimer *throttle_timer;
/* Current throttle percentage; 0 means throttling is off.  Accessed with
 * atomic_set()/atomic_read().
 */
static unsigned int throttle_percentage;

/* Bounds that cpu_throttle_set() clamps its argument to. */
#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/* Base time slice, in nanoseconds, for the throttle computations. */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
  85
  86 bool cpu_is_stopped(CPUState *cpu)
  87 {
  88     return cpu->stopped || !runstate_is_running();
  89 }
  90
  91 static bool cpu_thread_is_idle(CPUState *cpu)
  92 {
  93     if (cpu->stop || cpu->queued_work_first) {
  94         return false;
  95     }
  96     if (cpu_is_stopped(cpu)) {
  97         return true;
  98     }
  99     if (!cpu->halted || cpu_has_work(cpu) ||
 100         kvm_halt_in_kernel()) {
 101         return false;
 102     }
 103     return true;
 104 }
 105
 106 static bool all_cpu_threads_idle(void)
 107 {
 108     CPUState *cpu;
 109
 110     CPU_FOREACH(cpu) {
 111         if (!cpu_thread_is_idle(cpu)) {
 112             return false;
 113         }
 114     }
 115     return true;
 116 }
 117
 118 /***********************************************************/
 119 /* guest cycle counter */
 120
/* Protected by TimersState seqlock */

/* Whether icount mode lets vCPUs sleep; set from the "sleep" option in
 * configure_icount() and defaults to true.
 */
static bool icount_sleep = true;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
/* Upper bound that icount_adjust() enforces on icount_time_shift. */
#define MAX_ICOUNT_SHIFT 10
 126
/*
 * State behind the guest-visible tick/clock sources and the instruction
 * counter (icount).  This file keeps a single instance, timers_state.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    /* Last value handed out by cpu_get_ticks_locked(); used there to keep
     * the returned tick count from going backwards.
     */
    int64_t cpu_ticks_prev;
    /* Offset added to the host tick counter by cpu_get_ticks_locked(). */
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    /* Non-zero while ticks are enabled; toggled by cpu_enable_ticks()
     * and cpu_disable_ticks().
     */
    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks.  */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;

    /* Start of a pending clock warp, or -1 when none is pending
     * (see icount_warp_rt()).
     */
    int64_t vm_clock_warp_start;
    /* Offset added to get_clock() by cpu_get_clock_locked(). */
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    QEMUTimer *icount_warp_timer;
} TimersState;
 157
/* The single TimersState instance used by every function in this file. */
static TimersState timers_state;
/* Whether multi-threaded TCG is in use; assigned by qemu_tcg_configure(). */
bool mttcg_enabled;
 160
 161 /*
 162  * We default to false if we know other options have been enabled
 163  * which are currently incompatible with MTTCG. Otherwise when each
 164  * guest (target) has been updated to support:
 165  *   - atomic instructions
 166  *   - memory ordering primitives (barriers)
 167  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 168  *
 169  * Once a guest architecture has been converted to the new primitives
 170  * there are two remaining limitations to check.
 171  *
 172  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 173  * - The host must have a stronger memory order than the guest
 174  *
 175  * It may be possible in future to support strong guests on weak hosts
 176  * but that will require tagging all load/stores in a guest with their
 177  * implicit memory order requirements which would likely slow things
 178  * down a lot.
 179  */
 180
 181 static bool check_tcg_memory_orders_compatible(void)
 182 {
 183 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 184     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 185 #else
 186     return false;
 187 #endif
 188 }
 189
 190 static bool default_mttcg_enabled(void)
 191 {
 192     if (use_icount || TCG_OVERSIZED_GUEST) {
 193         return false;
 194     } else {
 195 #ifdef TARGET_SUPPORTS_MTTCG
 196         return check_tcg_memory_orders_compatible();
 197 #else
 198         return false;
 199 #endif
 200     }
 201 }
 202
 203 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 204 {
 205     const char *t = qemu_opt_get(opts, "thread");
 206     if (t) {
 207         if (strcmp(t, "multi") == 0) {
 208             if (TCG_OVERSIZED_GUEST) {
 209                 error_setg(errp, "No MTTCG when guest word size > hosts");
 210             } else if (use_icount) {
 211                 error_setg(errp, "No MTTCG when icount is enabled");
 212             } else {
 213 #ifndef TARGET_SUPPORTS_MTTCG
 214                 error_report("Guest not yet converted to MTTCG - "
 215                              "you may get unexpected results");
 216 #endif
 217                 if (!check_tcg_memory_orders_compatible()) {
 218                     error_report("Guest expects a stronger memory ordering "
 219                                  "than the host provides");
 220                     error_printf("This may cause strange/hard to debug errors\n");
 221                 }
 222                 mttcg_enabled = true;
 223             }
 224         } else if (strcmp(t, "single") == 0) {
 225             mttcg_enabled = false;
 226         } else {
 227             error_setg(errp, "Invalid 'thread' setting %s", t);
 228         }
 229     } else {
 230         mttcg_enabled = default_mttcg_enabled();
 231     }
 232 }
 233
 234 /* The current number of executed instructions is based on what we
 235  * originally budgeted minus the current state of the decrementing
 236  * icount counters in extra/u16.low.
 237  */
 238 static int64_t cpu_get_icount_executed(CPUState *cpu)
 239 {
 240     return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
 241 }
 242
 243 /*
 244  * Update the global shared timer_state.qemu_icount to take into
 245  * account executed instructions. This is done by the TCG vCPU
 246  * thread so the main-loop can see time has moved forward.
 247  */
 248 void cpu_update_icount(CPUState *cpu)
 249 {
 250     int64_t executed = cpu_get_icount_executed(cpu);
 251     cpu->icount_budget -= executed;
 252
 253 #ifndef CONFIG_ATOMIC64
 254     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 255                        &timers_state.vm_clock_lock);
 256 #endif
 257     atomic_set__nocheck(&timers_state.qemu_icount,
 258                         timers_state.qemu_icount + executed);
 259 #ifndef CONFIG_ATOMIC64
 260     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 261                          &timers_state.vm_clock_lock);
 262 #endif
 263 }
 264
 265 static int64_t cpu_get_icount_raw_locked(void)
 266 {
 267     CPUState *cpu = current_cpu;
 268
 269     if (cpu && cpu->running) {
 270         if (!cpu->can_do_io) {
 271             error_report("Bad icount read");
 272             exit(1);
 273         }
 274         /* Take into account what has run */
 275         cpu_update_icount(cpu);
 276     }
 277     /* The read is protected by the seqlock, so __nocheck is okay.  */
 278     return atomic_read__nocheck(&timers_state.qemu_icount);
 279 }
 280
 281 static int64_t cpu_get_icount_locked(void)
 282 {
 283     int64_t icount = cpu_get_icount_raw_locked();
 284     return atomic_read__nocheck(&timers_state.qemu_icount_bias) + cpu_icount_to_ns(icount);
 285 }
 286
 287 int64_t cpu_get_icount_raw(void)
 288 {
 289     int64_t icount;
 290     unsigned start;
 291
 292     do {
 293         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 294         icount = cpu_get_icount_raw_locked();
 295     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 296
 297     return icount;
 298 }
 299
 300 /* Return the virtual CPU time, based on the instruction counter.  */
 301 int64_t cpu_get_icount(void)
 302 {
 303     int64_t icount;
 304     unsigned start;
 305
 306     do {
 307         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 308         icount = cpu_get_icount_locked();
 309     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 310
 311     return icount;
 312 }
 313
 314 int64_t cpu_icount_to_ns(int64_t icount)
 315 {
 316     return icount << atomic_read(&timers_state.icount_time_shift);
 317 }
 318
 319 static int64_t cpu_get_ticks_locked(void)
 320 {
 321     int64_t ticks = timers_state.cpu_ticks_offset;
 322     if (timers_state.cpu_ticks_enabled) {
 323         ticks += cpu_get_host_ticks();
 324     }
 325
 326     if (timers_state.cpu_ticks_prev > ticks) {
 327         /* Non increasing ticks may happen if the host uses software suspend.  */
 328         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 329         ticks = timers_state.cpu_ticks_prev;
 330     }
 331
 332     timers_state.cpu_ticks_prev = ticks;
 333     return ticks;
 334 }
 335
 336 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 337  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 338  * counter.
 339  */
 340 int64_t cpu_get_ticks(void)
 341 {
 342     int64_t ticks;
 343
 344     if (use_icount) {
 345         return cpu_get_icount();
 346     }
 347
 348     qemu_spin_lock(&timers_state.vm_clock_lock);
 349     ticks = cpu_get_ticks_locked();
 350     qemu_spin_unlock(&timers_state.vm_clock_lock);
 351     return ticks;
 352 }
 353
 354 static int64_t cpu_get_clock_locked(void)
 355 {
 356     int64_t time;
 357
 358     time = timers_state.cpu_clock_offset;
 359     if (timers_state.cpu_ticks_enabled) {
 360         time += get_clock();
 361     }
 362
 363     return time;
 364 }
 365
 366 /* Return the monotonic time elapsed in VM, i.e.,
 367  * the time between vm_start and vm_stop
 368  */
 369 int64_t cpu_get_clock(void)
 370 {
 371     int64_t ti;
 372     unsigned start;
 373
 374     do {
 375         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 376         ti = cpu_get_clock_locked();
 377     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 378
 379     return ti;
 380 }
 381
 382 /* enable cpu_get_ticks()
 383  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 384  */
 385 void cpu_enable_ticks(void)
 386 {
 387     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 388                        &timers_state.vm_clock_lock);
 389     if (!timers_state.cpu_ticks_enabled) {
 390         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 391         timers_state.cpu_clock_offset -= get_clock();
 392         timers_state.cpu_ticks_enabled = 1;
 393     }
 394     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 395                        &timers_state.vm_clock_lock);
 396 }
 397
 398 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 399  * cpu_get_ticks() after that.
 400  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 401  */
 402 void cpu_disable_ticks(void)
 403 {
 404     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 405                        &timers_state.vm_clock_lock);
 406     if (timers_state.cpu_ticks_enabled) {
 407         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 408         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 409         timers_state.cpu_ticks_enabled = 0;
 410     }
 411     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 412                          &timers_state.vm_clock_lock);
 413 }
 414
 415 /* Correlation between real and virtual time is always going to be
 416    fairly approximate, so ignore small variation.
 417    When the guest is idle real and virtual time will be aligned in
 418    the IO wait loop.  */
 419 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 420
 421 static void icount_adjust(void)
 422 {
 423     int64_t cur_time;
 424     int64_t cur_icount;
 425     int64_t delta;
 426
 427     /* Protected by TimersState mutex.  */
 428     static int64_t last_delta;
 429
 430     /* If the VM is not running, then do nothing.  */
 431     if (!runstate_is_running()) {
 432         return;
 433     }
 434
 435     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 436                        &timers_state.vm_clock_lock);
 437     cur_time = cpu_get_clock_locked();
 438     cur_icount = cpu_get_icount_locked();
 439
 440     delta = cur_icount - cur_time;
 441     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 442     if (delta > 0
 443         && last_delta + ICOUNT_WOBBLE < delta * 2
 444         && timers_state.icount_time_shift > 0) {
 445         /* The guest is getting too far ahead.  Slow time down.  */
 446         atomic_set(&timers_state.icount_time_shift,
 447                    timers_state.icount_time_shift - 1);
 448     }
 449     if (delta < 0
 450         && last_delta - ICOUNT_WOBBLE > delta * 2
 451         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 452         /* The guest is getting too far behind.  Speed time up.  */
 453         atomic_set(&timers_state.icount_time_shift,
 454                    timers_state.icount_time_shift + 1);
 455     }
 456     last_delta = delta;
 457     atomic_set__nocheck(&timers_state.qemu_icount_bias,
 458                         cur_icount - (timers_state.qemu_icount
 459                                       << timers_state.icount_time_shift));
 460     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 461                          &timers_state.vm_clock_lock);
 462 }
 463
 464 static void icount_adjust_rt(void *opaque)
 465 {
 466     timer_mod(timers_state.icount_rt_timer,
 467               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 468     icount_adjust();
 469 }
 470
 471 static void icount_adjust_vm(void *opaque)
 472 {
 473     timer_mod(timers_state.icount_vm_timer,
 474                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 475                    NANOSECONDS_PER_SECOND / 10);
 476     icount_adjust();
 477 }
 478
 479 static int64_t qemu_icount_round(int64_t count)
 480 {
 481     int shift = atomic_read(&timers_state.icount_time_shift);
 482     return (count + (1 << shift) - 1) >> shift;
 483 }
 484
 485 static void icount_warp_rt(void)
 486 {
 487     unsigned seq;
 488     int64_t warp_start;
 489
 490     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 491      * changes from -1 to another value, so the race here is okay.
 492      */
 493     do {
 494         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 495         warp_start = timers_state.vm_clock_warp_start;
 496     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 497
 498     if (warp_start == -1) {
 499         return;
 500     }
 501
 502     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 503                        &timers_state.vm_clock_lock);
 504     if (runstate_is_running()) {
 505         int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
 506                                      cpu_get_clock_locked());
 507         int64_t warp_delta;
 508
 509         warp_delta = clock - timers_state.vm_clock_warp_start;
 510         if (use_icount == 2) {
 511             /*
 512              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 513              * far ahead of real time.
 514              */
 515             int64_t cur_icount = cpu_get_icount_locked();
 516             int64_t delta = clock - cur_icount;
 517             warp_delta = MIN(warp_delta, delta);
 518         }
 519         atomic_set__nocheck(&timers_state.qemu_icount_bias,
 520                             timers_state.qemu_icount_bias + warp_delta);
 521     }
 522     timers_state.vm_clock_warp_start = -1;
 523     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 524                        &timers_state.vm_clock_lock);
 525
 526     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 527         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 528     }
 529 }
 530
 531 static void icount_timer_cb(void *opaque)
 532 {
 533     /* No need for a checkpoint because the timer already synchronizes
 534      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 535      */
 536     icount_warp_rt();
 537 }
 538
 539 void qtest_clock_warp(int64_t dest)
 540 {
 541     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 542     AioContext *aio_context;
 543     assert(qtest_enabled());
 544     aio_context = qemu_get_aio_context();
 545     while (clock < dest) {
 546         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 547         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 548
 549         seqlock_write_lock(&timers_state.vm_clock_seqlock,
 550                            &timers_state.vm_clock_lock);
 551         atomic_set__nocheck(&timers_state.qemu_icount_bias,
 552                             timers_state.qemu_icount_bias + warp);
 553         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 554                              &timers_state.vm_clock_lock);
 555
 556         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 557         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 558         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 559     }
 560     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 561 }
 562
 563 void qemu_start_warp_timer(void)
 564 {
 565     int64_t clock;
 566     int64_t deadline;
 567
 568     if (!use_icount) {
 569         return;
 570     }
 571
 572     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 573      * do not fire, so computing the deadline does not make sense.
 574      */
 575     if (!runstate_is_running()) {
 576         return;
 577     }
 578
 579     /* warp clock deterministically in record/replay mode */
 580     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 581         return;
 582     }
 583
 584     if (!all_cpu_threads_idle()) {
 585         return;
 586     }
 587
 588     if (qtest_enabled()) {
 589         /* When testing, qtest commands advance icount.  */
 590         return;
 591     }
 592
 593     /* We want to use the earliest deadline from ALL vm_clocks */
 594     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 595     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 596     if (deadline < 0) {
 597         static bool notified;
 598         if (!icount_sleep && !notified) {
 599             warn_report("icount sleep disabled and no active timers");
 600             notified = true;
 601         }
 602         return;
 603     }
 604
 605     if (deadline > 0) {
 606         /*
 607          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 608          * sleep.  Otherwise, the CPU might be waiting for a future timer
 609          * interrupt to wake it up, but the interrupt never comes because
 610          * the vCPU isn't running any insns and thus doesn't advance the
 611          * QEMU_CLOCK_VIRTUAL.
 612          */
 613         if (!icount_sleep) {
 614             /*
 615              * We never let VCPUs sleep in no sleep icount mode.
 616              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 617              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 618              * It is useful when we want a deterministic execution time,
 619              * isolated from host latencies.
 620              */
 621             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 622                                &timers_state.vm_clock_lock);
 623             atomic_set__nocheck(&timers_state.qemu_icount_bias,
 624                                 timers_state.qemu_icount_bias + deadline);
 625             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 626                                  &timers_state.vm_clock_lock);
 627             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 628         } else {
 629             /*
 630              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 631              * "real" time, (related to the time left until the next event) has
 632              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 633              * This avoids that the warps are visible externally; for example,
 634              * you will not be sending network packets continuously instead of
 635              * every 100ms.
 636              */
 637             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 638                                &timers_state.vm_clock_lock);
 639             if (timers_state.vm_clock_warp_start == -1
 640                 || timers_state.vm_clock_warp_start > clock) {
 641                 timers_state.vm_clock_warp_start = clock;
 642             }
 643             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 644                                  &timers_state.vm_clock_lock);
 645             timer_mod_anticipate(timers_state.icount_warp_timer,
 646                                  clock + deadline);
 647         }
 648     } else if (deadline == 0) {
 649         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 650     }
 651 }
 652
 653 static void qemu_account_warp_timer(void)
 654 {
 655     if (!use_icount || !icount_sleep) {
 656         return;
 657     }
 658
 659     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 660      * do not fire, so computing the deadline does not make sense.
 661      */
 662     if (!runstate_is_running()) {
 663         return;
 664     }
 665
 666     /* warp clock deterministically in record/replay mode */
 667     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 668         return;
 669     }
 670
 671     timer_del(timers_state.icount_warp_timer);
 672     icount_warp_rt();
 673 }
 674
 675 static bool icount_state_needed(void *opaque)
 676 {
 677     return use_icount;
 678 }
 679
 680 static bool warp_timer_state_needed(void *opaque)
 681 {
 682     TimersState *s = opaque;
 683     return s->icount_warp_timer != NULL;
 684 }
 685
 686 static bool adjust_timers_state_needed(void *opaque)
 687 {
 688     TimersState *s = opaque;
 689     return s->icount_rt_timer != NULL;
 690 }
 691
/*
 * The subsection for warp timer migration is optional, because the warp
 * timer may not have been created (see warp_timer_state_needed()).
 * It carries the pending warp start time and the warp timer itself.
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 706
/*
 * Optional subsection carrying the two icount adjustment timers; it is
 * included only when adjust_timers_state_needed() reports that the
 * realtime adjustment timer exists.
 */
static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 718
/*
 * This is a subsection for icount migration.  It is included only when
 * icount is in use (icount_state_needed()) and carries the icount bias
 * and the instruction counter, plus the optional warp-timer and
 * adjustment-timer subsections.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        NULL
    }
};
 738
/*
 * Top-level migration description for TimersState, registered by
 * cpu_ticks_init().  It carries the tick offset, 8 unused bytes, and
 * (from version 2) the clock offset; icount state travels in the
 * "timer/icount" subsection.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_UNUSED(8),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
 754
 755 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 756 {
 757     double pct;
 758     double throttle_ratio;
 759     long sleeptime_ns;
 760
 761     if (!cpu_throttle_get_percentage()) {
 762         return;
 763     }
 764
 765     pct = (double)cpu_throttle_get_percentage()/100;
 766     throttle_ratio = pct / (1 - pct);
 767     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
 768
 769     qemu_mutex_unlock_iothread();
 770     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
 771     qemu_mutex_lock_iothread();
 772     atomic_set(&cpu->throttle_thread_scheduled, 0);
 773 }
 774
 775 static void cpu_throttle_timer_tick(void *opaque)
 776 {
 777     CPUState *cpu;
 778     double pct;
 779
 780     /* Stop the timer if needed */
 781     if (!cpu_throttle_get_percentage()) {
 782         return;
 783     }
 784     CPU_FOREACH(cpu) {
 785         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 786             async_run_on_cpu(cpu, cpu_throttle_thread,
 787                              RUN_ON_CPU_NULL);
 788         }
 789     }
 790
 791     pct = (double)cpu_throttle_get_percentage()/100;
 792     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 793                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 794 }
 795
 796 void cpu_throttle_set(int new_throttle_pct)
 797 {
 798     /* Ensure throttle percentage is within valid range */
 799     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 800     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 801
 802     atomic_set(&throttle_percentage, new_throttle_pct);
 803
 804     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 805                                        CPU_THROTTLE_TIMESLICE_NS);
 806 }
 807
 808 void cpu_throttle_stop(void)
 809 {
 810     atomic_set(&throttle_percentage, 0);
 811 }
 812
 813 bool cpu_throttle_active(void)
 814 {
 815     return (cpu_throttle_get_percentage() != 0);
 816 }
 817
 818 int cpu_throttle_get_percentage(void)
 819 {
 820     return atomic_read(&throttle_percentage);
 821 }
 822
 823 void cpu_ticks_init(void)
 824 {
 825     seqlock_init(&timers_state.vm_clock_seqlock);
 826     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 827     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 828                                            cpu_throttle_timer_tick, NULL);
 829 }
 830
 831 void configure_icount(QemuOpts *opts, Error **errp)
 832 {
 833     const char *option;
 834     char *rem_str = NULL;
 835
 836     option = qemu_opt_get(opts, "shift");
 837     if (!option) {
 838         if (qemu_opt_get(opts, "align") != NULL) {
 839             error_setg(errp, "Please specify shift option when using align");
 840         }
 841         return;
 842     }
 843
 844     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 845     if (icount_sleep) {
 846         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 847                                          icount_timer_cb, NULL);
 848     }
 849
 850     icount_align_option = qemu_opt_get_bool(opts, "align", false);
 851
 852     if (icount_align_option && !icount_sleep) {
 853         error_setg(errp, "align=on and sleep=off are incompatible");
 854     }
 855     if (strcmp(option, "auto") != 0) {
 856         errno = 0;
 857         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
 858         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 859             error_setg(errp, "icount: Invalid shift value");
 860         }
 861         use_icount = 1;
 862         return;
 863     } else if (icount_align_option) {
 864         error_setg(errp, "shift=auto and align=on are incompatible");
 865     } else if (!icount_sleep) {
 866         error_setg(errp, "shift=auto and sleep=off are incompatible");
 867     }
 868
 869     use_icount = 2;
 870
 871     /* 125MIPS seems a reasonable initial guess at the guest speed.
 872        It will be corrected fairly quickly anyway.  */
 873     timers_state.icount_time_shift = 3;
 874
 875     /* Have both realtime and virtual time triggers for speed adjustment.
 876        The realtime trigger catches emulated time passing too slowly,
 877        the virtual time trigger catches emulated time passing too fast.
 878        Realtime triggers occur even when idle, so use them less frequently
 879        than VM triggers.  */
 880     timers_state.vm_clock_warp_start = -1;
 881     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 882                                    icount_adjust_rt, NULL);
 883     timer_mod(timers_state.icount_rt_timer,
 884                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 885     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 886                                         icount_adjust_vm, NULL);
 887     timer_mod(timers_state.icount_vm_timer,
 888                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 889                    NANOSECONDS_PER_SECOND / 10);
 890 }
 891
 892 /***********************************************************/
 893 /* TCG vCPU kick timer
 894  *
 895  * The kick timer is responsible for moving single threaded vCPU
 896  * emulation on to the next vCPU. If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
 898  * scheduled.
 899  *
 900  * The timer is removed if all vCPUs are idle and restarted again once
 901  * idleness is complete.
 902  */
 903
/* Timer used to kick the round-robin TCG thread; NULL while no kick timer
 * exists (created in start_tcg_kick_timer, cleared in stop_tcg_kick_timer). */
static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU the round-robin thread is currently executing, or NULL. It is read
 * and written with the atomic_* helpers throughout this file. */
static CPUState *tcg_current_rr_cpu;

/* Interval between kicks: one tenth of NANOSECONDS_PER_SECOND. */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 908
 909 static inline int64_t qemu_tcg_next_kick(void)
 910 {
 911     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 912 }
 913
 914 /* Kick the currently round-robin scheduled vCPU */
 915 static void qemu_cpu_kick_rr_cpu(void)
 916 {
 917     CPUState *cpu;
 918     do {
 919         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 920         if (cpu) {
 921             cpu_exit(cpu);
 922         }
 923     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 924 }
 925
/* Empty work item. qemu_timer_notify_cb() queues it with async_run_on_cpu()
 * only for the side effect of having work pending on the CPU. */
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}
 929
 930 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 931 {
 932     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 933         qemu_notify_event();
 934         return;
 935     }
 936
 937     if (qemu_in_vcpu_thread()) {
 938         /* A CPU is currently running; kick it back out to the
 939          * tcg_cpu_exec() loop so it will recalculate its
 940          * icount deadline immediately.
 941          */
 942         qemu_cpu_kick(current_cpu);
 943     } else if (first_cpu) {
 944         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 945          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 946          * causes cpu_thread_is_idle to return false.  This way,
 947          * handle_icount_deadline can run.
 948          * If we have no CPUs at all for some reason, we don't
 949          * need to do anything.
 950          */
 951         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 952     }
 953 }
 954
 955 static void kick_tcg_thread(void *opaque)
 956 {
 957     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 958     qemu_cpu_kick_rr_cpu();
 959 }
 960
 961 static void start_tcg_kick_timer(void)
 962 {
 963     assert(!mttcg_enabled);
 964     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 965         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 966                                            kick_tcg_thread, NULL);
 967         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 968     }
 969 }
 970
 971 static void stop_tcg_kick_timer(void)
 972 {
 973     assert(!mttcg_enabled);
 974     if (tcg_kick_vcpu_timer) {
 975         timer_del(tcg_kick_vcpu_timer);
 976         tcg_kick_vcpu_timer = NULL;
 977     }
 978 }
 979
 980 /***********************************************************/
 981 void hw_error(const char *fmt, ...)
 982 {
 983     va_list ap;
 984     CPUState *cpu;
 985
 986     va_start(ap, fmt);
 987     fprintf(stderr, "qemu: hardware error: ");
 988     vfprintf(stderr, fmt, ap);
 989     fprintf(stderr, "\n");
 990     CPU_FOREACH(cpu) {
 991         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
 992         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
 993     }
 994     va_end(ap);
 995     abort();
 996 }
 997
 998 void cpu_synchronize_all_states(void)
 999 {
1000     CPUState *cpu;
1001
1002     CPU_FOREACH(cpu) {
1003         cpu_synchronize_state(cpu);
1004         /* TODO: move to cpu_synchronize_state() */
1005         if (hvf_enabled()) {
1006             hvf_cpu_synchronize_state(cpu);
1007         }
1008     }
1009 }
1010
1011 void cpu_synchronize_all_post_reset(void)
1012 {
1013     CPUState *cpu;
1014
1015     CPU_FOREACH(cpu) {
1016         cpu_synchronize_post_reset(cpu);
1017         /* TODO: move to cpu_synchronize_post_reset() */
1018         if (hvf_enabled()) {
1019             hvf_cpu_synchronize_post_reset(cpu);
1020         }
1021     }
1022 }
1023
1024 void cpu_synchronize_all_post_init(void)
1025 {
1026     CPUState *cpu;
1027
1028     CPU_FOREACH(cpu) {
1029         cpu_synchronize_post_init(cpu);
1030         /* TODO: move to cpu_synchronize_post_init() */
1031         if (hvf_enabled()) {
1032             hvf_cpu_synchronize_post_init(cpu);
1033         }
1034     }
1035 }
1036
1037 void cpu_synchronize_all_pre_loadvm(void)
1038 {
1039     CPUState *cpu;
1040
1041     CPU_FOREACH(cpu) {
1042         cpu_synchronize_pre_loadvm(cpu);
1043     }
1044 }
1045
1046 static int do_vm_stop(RunState state, bool send_stop)
1047 {
1048     int ret = 0;
1049
1050     if (runstate_is_running()) {
1051         cpu_disable_ticks();
1052         pause_all_vcpus();
1053         runstate_set(state);
1054         vm_state_notify(0, state);
1055         if (send_stop) {
1056             qapi_event_send_stop(&error_abort);
1057         }
1058     }
1059
1060     bdrv_drain_all();
1061     replay_disable_events();
1062     ret = bdrv_flush_all();
1063
1064     return ret;
1065 }
1066
1067 /* Special vm_stop() variant for terminating the process.  Historically clients
1068  * did not expect a QMP STOP event and so we need to retain compatibility.
1069  */
1070 int vm_shutdown(void)
1071 {
1072     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1073 }
1074
1075 static bool cpu_can_run(CPUState *cpu)
1076 {
1077     if (cpu->stop) {
1078         return false;
1079     }
1080     if (cpu_is_stopped(cpu)) {
1081         return false;
1082     }
1083     return true;
1084 }
1085
/* Handle a vCPU run that returned EXCP_DEBUG: record @cpu as the CPU the
 * gdbstub stopped on, raise a system debug request and mark the vCPU as
 * stopped. */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1092
1093 #ifdef CONFIG_LINUX
1094 static void sigbus_reraise(void)
1095 {
1096     sigset_t set;
1097     struct sigaction action;
1098
1099     memset(&action, 0, sizeof(action));
1100     action.sa_handler = SIG_DFL;
1101     if (!sigaction(SIGBUS, &action, NULL)) {
1102         raise(SIGBUS);
1103         sigemptyset(&set);
1104         sigaddset(&set, SIGBUS);
1105         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1106     }
1107     perror("Failed to re-raise SIGBUS!\n");
1108     abort();
1109 }
1110
1111 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1112 {
1113     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1114         sigbus_reraise();
1115     }
1116
1117     if (current_cpu) {
1118         /* Called asynchronously in VCPU thread.  */
1119         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1120             sigbus_reraise();
1121         }
1122     } else {
1123         /* Called synchronously (via signalfd) in main thread.  */
1124         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1125             sigbus_reraise();
1126         }
1127     }
1128 }
1129
1130 static void qemu_init_sigbus(void)
1131 {
1132     struct sigaction action;
1133
1134     memset(&action, 0, sizeof(action));
1135     action.sa_flags = SA_SIGINFO;
1136     action.sa_sigaction = sigbus_handler;
1137     sigaction(SIGBUS, &action, NULL);
1138
1139     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1140 }
1141 #else /* !CONFIG_LINUX */
/* Builds without CONFIG_LINUX install no SIGBUS handler; nothing to do. */
static void qemu_init_sigbus(void)
{
}
1145 #endif /* !CONFIG_LINUX */
1146
/* Mutex taken by qemu_mutex_lock_iothread_impl() and released by
 * qemu_mutex_unlock_iothread(); also paired with the condition variables
 * below and with each CPU's halt_cond. */
static QemuMutex qemu_global_mutex;

/* Identity of the thread that called qemu_init_cpu_loop(). */
static QemuThread io_thread;

/* cpu creation: signalled by vCPU threads when cpu->created changes */
static QemuCond qemu_cpu_cond;
/* vCPU pausing: broadcast by qemu_cpu_stop(), waited on by pause_all_vcpus() */
static QemuCond qemu_pause_cond;
1155
/* One-time initialisation of the CPU loop infrastructure: sets up SIGBUS
 * handling, initialises the global condition variables and mutex, and
 * records the calling thread in io_thread. */
void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}
1165
/* Run @func with @data on @cpu by delegating to do_run_on_cpu(), passing the
 * global mutex as the lock that helper works with. */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1170
1171 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1172 {
1173     if (kvm_destroy_vcpu(cpu) < 0) {
1174         error_report("kvm_destroy_vcpu failed");
1175         exit(EXIT_FAILURE);
1176     }
1177 }
1178
/* TCG counterpart of qemu_kvm_destroy_vcpu(); currently there is nothing to
 * tear down. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1182
/* Mark @cpu as stopped and wake everyone waiting on qemu_pause_cond.
 *
 * @cpu:  the vCPU to stop; must be the vCPU owning the calling thread.
 * @exit: when true, additionally call cpu_exit() on it.
 *
 * The pending stop request (cpu->stop) is cleared because it has now been
 * honoured.
 */
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}
1193
/* Housekeeping shared by all wait_io_event variants: clear the "kicked"
 * flag, honour a pending stop request and run any work queued for @cpu. */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
1202
/* Round-robin TCG idle wait.
 *
 * While every vCPU thread is idle, the kick timer is stopped and the thread
 * sleeps on @cpu's halt_cond (qemu_cond_wait is passed qemu_global_mutex).
 * Once there is something to do, the kick timer is (re)started and the
 * common housekeeping runs for @cpu.
 */
static void qemu_tcg_rr_wait_io_event(CPUState *cpu)
{
    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    qemu_wait_io_event_common(cpu);
}
1214
/* Per-vCPU-thread idle wait: sleep on @cpu's halt_cond for as long as the
 * thread is idle, then run the common housekeeping. */
static void qemu_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

#ifdef _WIN32
    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
    if (!tcg_enabled()) {
        SleepEx(0, TRUE);
    }
#endif
    qemu_wait_io_event_common(cpu);
}
1229
/* Thread entry point for a KVM vCPU.
 *
 * @arg: the CPUState this thread drives.
 *
 * Takes the iothread lock, initialises the KVM vCPU (exiting the process on
 * failure), announces creation on qemu_cpu_cond and then alternates between
 * kvm_cpu_exec() and qemu_wait_io_event() until the CPU is flagged for
 * unplug and can no longer run. On the way out the vCPU is destroyed,
 * cpu->created is cleared and signalled, and the iothread lock is released.
 *
 * Always returns NULL.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* unplugged and no longer runnable: tear down and report it */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1272
1273 static void *qemu_dummy_cpu_thread_fn(void *arg)
1274 {
1275 #ifdef _WIN32
1276     error_report("qtest is not supported under Windows");
1277     exit(1);
1278 #else
1279     CPUState *cpu = arg;
1280     sigset_t waitset;
1281     int r;
1282
1283     rcu_register_thread();
1284
1285     qemu_mutex_lock_iothread();
1286     qemu_thread_get_self(cpu->thread);
1287     cpu->thread_id = qemu_get_thread_id();
1288     cpu->can_do_io = 1;
1289     current_cpu = cpu;
1290
1291     sigemptyset(&waitset);
1292     sigaddset(&waitset, SIG_IPI);
1293
1294     /* signal CPU creation */
1295     cpu->created = true;
1296     qemu_cond_signal(&qemu_cpu_cond);
1297
1298     do {
1299         qemu_mutex_unlock_iothread();
1300         do {
1301             int sig;
1302             r = sigwait(&waitset, &sig);
1303         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1304         if (r == -1) {
1305             perror("sigwait");
1306             exit(1);
1307         }
1308         qemu_mutex_lock_iothread();
1309         qemu_wait_io_event(cpu);
1310     } while (!cpu->unplug);
1311
1312     rcu_unregister_thread();
1313     return NULL;
1314 #endif
1315 }
1316
1317 static int64_t tcg_get_icount_limit(void)
1318 {
1319     int64_t deadline;
1320
1321     if (replay_mode != REPLAY_MODE_PLAY) {
1322         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1323
1324         /* Maintain prior (possibly buggy) behaviour where if no deadline
1325          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1326          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1327          * nanoseconds.
1328          */
1329         if ((deadline < 0) || (deadline > INT32_MAX)) {
1330             deadline = INT32_MAX;
1331         }
1332
1333         return qemu_icount_round(deadline);
1334     } else {
1335         return replay_get_instructions();
1336     }
1337 }
1338
1339 static void handle_icount_deadline(void)
1340 {
1341     assert(qemu_in_vcpu_thread());
1342     if (use_icount) {
1343         int64_t deadline =
1344             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1345
1346         if (deadline == 0) {
1347             /* Wake up other AioContexts.  */
1348             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1349             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1350         }
1351     }
1352 }
1353
1354 static void prepare_icount_for_run(CPUState *cpu)
1355 {
1356     if (use_icount) {
1357         int insns_left;
1358
1359         /* These should always be cleared by process_icount_data after
1360          * each vCPU execution. However u16.high can be raised
1361          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1362          */
1363         g_assert(cpu->icount_decr.u16.low == 0);
1364         g_assert(cpu->icount_extra == 0);
1365
1366         cpu->icount_budget = tcg_get_icount_limit();
1367         insns_left = MIN(0xffff, cpu->icount_budget);
1368         cpu->icount_decr.u16.low = insns_left;
1369         cpu->icount_extra = cpu->icount_budget - insns_left;
1370
1371         replay_mutex_lock();
1372     }
1373 }
1374
1375 static void process_icount_data(CPUState *cpu)
1376 {
1377     if (use_icount) {
1378         /* Account for executed instructions */
1379         cpu_update_icount(cpu);
1380
1381         /* Reset the counters */
1382         cpu->icount_decr.u16.low = 0;
1383         cpu->icount_extra = 0;
1384         cpu->icount_budget = 0;
1385
1386         replay_account_executed_instructions();
1387
1388         replay_mutex_unlock();
1389     }
1390 }
1391
1392
1393 static int tcg_cpu_exec(CPUState *cpu)
1394 {
1395     int ret;
1396 #ifdef CONFIG_PROFILER
1397     int64_t ti;
1398 #endif
1399
1400     assert(tcg_enabled());
1401 #ifdef CONFIG_PROFILER
1402     ti = profile_getclock();
1403 #endif
1404     cpu_exec_start(cpu);
1405     ret = cpu_exec(cpu);
1406     cpu_exec_end(cpu);
1407 #ifdef CONFIG_PROFILER
1408     tcg_time += profile_getclock() - ti;
1409 #endif
1410     return ret;
1411 }
1412
1413 /* Destroy any remaining vCPUs which have been unplugged and have
1414  * finished running
1415  */
1416 static void deal_with_unplugged_cpus(void)
1417 {
1418     CPUState *cpu;
1419
1420     CPU_FOREACH(cpu) {
1421         if (cpu->unplug && !cpu_can_run(cpu)) {
1422             qemu_tcg_destroy_vcpu(cpu);
1423             cpu->created = false;
1424             qemu_cond_signal(&qemu_cpu_cond);
1425             break;
1426         }
1427     }
1428 }
1429
1430 /* Single-threaded TCG
1431  *
1432  * In the single-threaded case each vCPU is simulated in turn. If
1433  * there is more than a single vCPU we create a simple timer to kick
1434  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1435  * This is done explicitly rather than relying on side-effects
1436  * elsewhere.
1437  */
1438
/* Thread entry point for single-threaded (round-robin) TCG.
 *
 * @arg: the CPUState that caused this thread to be created.
 *
 * After announcing creation, the thread waits until first_cpu is no longer
 * stopped, then loops forever: on each pass it handles icount timers, runs
 * vCPUs one after another starting from where it left off, and finally waits
 * for I/O events and reaps unplugged CPUs. The iothread lock is held except
 * around guest execution and while acquiring the replay mutex.
 *
 * The outer loop has no exit, so the trailing cleanup is never reached.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /* The iothread lock is dropped before taking the replay mutex and
         * re-taken afterwards, so the replay mutex is never acquired while
         * holding the iothread lock here.
         */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* wrap around to the head of the CPU list */
        if (!cpu) {
            cpu = first_cpu;
        }

        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* guest code runs without the iothread lock */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* skip past a CPU that is being unplugged */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        qemu_tcg_rr_wait_io_event(cpu ? cpu : first_cpu);
        deal_with_unplugged_cpus();
    }

    rcu_unregister_thread();
    return NULL;
}
1543
1544 static void *qemu_hax_cpu_thread_fn(void *arg)
1545 {
1546     CPUState *cpu = arg;
1547     int r;
1548
1549     rcu_register_thread();
1550     qemu_mutex_lock_iothread();
1551     qemu_thread_get_self(cpu->thread);
1552
1553     cpu->thread_id = qemu_get_thread_id();
1554     cpu->created = true;
1555     cpu->halted = 0;
1556     current_cpu = cpu;
1557
1558     hax_init_vcpu(cpu);
1559     qemu_cond_signal(&qemu_cpu_cond);
1560
1561     do {
1562         if (cpu_can_run(cpu)) {
1563             r = hax_smp_cpu_exec(cpu);
1564             if (r == EXCP_DEBUG) {
1565                 cpu_handle_guest_debug(cpu);
1566             }
1567         }
1568
1569         qemu_wait_io_event(cpu);
1570     } while (!cpu->unplug || cpu_can_run(cpu));
1571     rcu_unregister_thread();
1572     return NULL;
1573 }
1574
/* The HVF-specific vCPU thread function. This one should only run when the host
 * CPU supports the VMX "unrestricted guest" feature.
 *
 * @arg: the CPUState this thread drives.
 *
 * Takes the iothread lock, initialises the HVF vCPU, announces creation on
 * qemu_cpu_cond and then alternates between hvf_vcpu_exec() and
 * qemu_wait_io_event() until the CPU is flagged for unplug and can no longer
 * run. On the way out the vCPU is destroyed, cpu->created is cleared and
 * signalled, and the iothread lock is released.
 *
 * Always returns NULL.
 */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* unplugged and no longer runnable: tear down and report it */
    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1617
/* Thread entry point for a WHPX vCPU.
 *
 * @arg: the CPUState this thread drives.
 *
 * Takes the iothread lock, initialises the WHPX vCPU (exiting the process on
 * failure), announces creation on qemu_cpu_cond and then alternates between
 * whpx_vcpu_exec() and an idle wait until the CPU is flagged for unplug and
 * can no longer run. On the way out the vCPU is destroyed, cpu->created is
 * cleared and signalled, and the iothread lock is released.
 *
 * Always returns NULL.
 */
static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        /* Same wait as qemu_wait_io_event(), open-coded here without that
         * function's _WIN32 SleepEx() step.
         */
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* unplugged and no longer runnable: tear down and report it */
    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1660
1661 #ifdef _WIN32
/* Empty APC routine; qemu_cpu_kick_thread() queues it with QueueUserAPC()
 * on the target vCPU thread. */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1665 #endif
1666
1667 /* Multi-threaded TCG
1668  *
1669  * In the multi-threaded case each vCPU has its own thread. The TLS
1670  * variable current_cpu can be used deep in the code to find the
1671  * current CPUState for a given thread.
1672  */
1673
/* Thread entry point for one vCPU under multi-threaded TCG.
 *
 * @arg: the CPUState this thread drives.
 *
 * icount must be disabled. After announcing creation the thread alternates
 * between tcg_cpu_exec() (run without the iothread lock) and
 * qemu_wait_io_event() until the CPU is flagged for unplug and can no longer
 * run. On the way out the vCPU is destroyed, cpu->created is cleared and
 * signalled, and the iothread lock is released.
 *
 * Always returns NULL.
 */
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    g_assert(!use_icount);

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);

    /* process any pending work */
    cpu->exit_request = 1;

    do {
        if (cpu_can_run(cpu)) {
            int r;
            qemu_mutex_unlock_iothread();
            r = tcg_cpu_exec(cpu);
            qemu_mutex_lock_iothread();
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start-up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
                /* fall through */
            default:
                /* Ignore everything else? */
                break;
            }
        }

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* unplugged and no longer runnable: tear down and report it */
    qemu_tcg_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1737
1738 static void qemu_cpu_kick_thread(CPUState *cpu)
1739 {
1740 #ifndef _WIN32
1741     int err;
1742
1743     if (cpu->thread_kicked) {
1744         return;
1745     }
1746     cpu->thread_kicked = true;
1747     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1748     if (err) {
1749         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1750         exit(1);
1751     }
1752 #else /* _WIN32 */
1753     if (!qemu_cpu_is_self(cpu)) {
1754         if (whpx_enabled()) {
1755             whpx_vcpu_kick(cpu);
1756         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1757             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1758                     __func__, GetLastError());
1759             exit(1);
1760         }
1761     }
1762 #endif
1763 }
1764
1765 void qemu_cpu_kick(CPUState *cpu)
1766 {
1767     qemu_cond_broadcast(cpu->halt_cond);
1768     if (tcg_enabled()) {
1769         cpu_exit(cpu);
1770         /* NOP unless doing single-thread RR */
1771         qemu_cpu_kick_rr_cpu();
1772     } else {
1773         if (hax_enabled()) {
1774             /*
1775              * FIXME: race condition with the exit_request check in
1776              * hax_vcpu_hax_exec
1777              */
1778             cpu->exit_request = 1;
1779         }
1780         qemu_cpu_kick_thread(cpu);
1781     }
1782 }
1783
/* Kick the host thread of the calling vCPU (current_cpu, which must be
 * set). */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1789
/* Return true if the calling thread is the thread recorded for @cpu. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1794
/* Return true if current_cpu is set and the calling thread is that CPU's
 * thread. */
bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}
1799
/* Per-thread flag: true while this thread holds the iothread lock. It is
 * maintained by qemu_mutex_lock_iothread_impl()/qemu_mutex_unlock_iothread(). */
static __thread bool iothread_locked = false;

/* Return true if the calling thread currently holds the iothread lock. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1806
/*
 * The BQL is taken from so many places that it is worth profiling the
 * callers directly, instead of funneling them all through a single function.
 *
 * @file, @line: call site, forwarded to the lock function.
 *
 * The calling thread must not already hold the lock. The lock function is
 * read from qemu_bql_mutex_lock_func on every call.
 */
void qemu_mutex_lock_iothread_impl(const char *file, int line)
{
    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);

    g_assert(!qemu_mutex_iothread_locked());
    bql_lock(&qemu_global_mutex, file, line);
    iothread_locked = true;
}
1819
/* Release the iothread lock. The calling thread must hold it; the per-thread
 * flag is cleared before the mutex is actually released. */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1826
1827 static bool all_vcpus_paused(void)
1828 {
1829     CPUState *cpu;
1830
1831     CPU_FOREACH(cpu) {
1832         if (!cpu->stopped) {
1833             return false;
1834         }
1835     }
1836
1837     return true;
1838 }
1839
/* Stop every vCPU and wait until all of them report 'stopped'.
 *
 * The virtual clock is disabled first. The calling vCPU (if any) is stopped
 * directly; all others get a stop request and a kick. The function then
 * sleeps on qemu_pause_cond, re-kicking every CPU after each wakeup, until
 * all_vcpus_paused() holds.
 *
 * NOTE(review): the code unlocks and later re-locks both the replay mutex
 * and the iothread lock, so callers presumably hold both on entry — confirm.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /* Re-take the replay mutex with the iothread lock dropped, then take
     * the iothread lock again.
     */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1870
/* Let @cpu run again: clear both the stop request and the stopped state,
 * then kick it so it notices. */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1877
1878 void resume_all_vcpus(void)
1879 {
1880     CPUState *cpu;
1881
1882     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1883     CPU_FOREACH(cpu) {
1884         cpu_resume(cpu);
1885     }
1886 }
1887
/* Request removal of @cpu and wait for its thread to exit.
 *
 * The CPU is flagged for stop and unplug and kicked; the iothread lock is
 * then dropped while joining the vCPU thread and re-taken afterwards.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1897
/* Size, in bytes, of the temporary buffers used to format vCPU thread names
 * (the snprintf() calls below pass it as the buffer size). */
#define VCPU_THREAD_NAME_SIZE 16
1900
1901 static void qemu_tcg_init_vcpu(CPUState *cpu)
1902 {
1903     char thread_name[VCPU_THREAD_NAME_SIZE];
1904     static QemuCond *single_tcg_halt_cond;
1905     static QemuThread *single_tcg_cpu_thread;
1906     static int tcg_region_inited;
1907
1908     assert(tcg_enabled());
1909     /*
1910      * Initialize TCG regions--once. Now is a good time, because:
1911      * (1) TCG's init context, prologue and target globals have been set up.
1912      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1913      *     -accel flag is processed, so the check doesn't work then).
1914      */
1915     if (!tcg_region_inited) {
1916         tcg_region_inited = 1;
1917         tcg_region_init();
1918     }
1919
1920     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1921         cpu->thread = g_malloc0(sizeof(QemuThread));
1922         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1923         qemu_cond_init(cpu->halt_cond);
1924
1925         if (qemu_tcg_mttcg_enabled()) {
1926             /* create a thread per vCPU with TCG (MTTCG) */
1927             parallel_cpus = true;
1928             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1929                  cpu->cpu_index);
1930
1931             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1932                                cpu, QEMU_THREAD_JOINABLE);
1933
1934         } else {
1935             /* share a single thread for all cpus with TCG */
1936             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1937             qemu_thread_create(cpu->thread, thread_name,
1938                                qemu_tcg_rr_cpu_thread_fn,
1939                                cpu, QEMU_THREAD_JOINABLE);
1940
1941             single_tcg_halt_cond = cpu->halt_cond;
1942             single_tcg_cpu_thread = cpu->thread;
1943         }
1944 #ifdef _WIN32
1945         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1946 #endif
1947     } else {
1948         /* For non-MTTCG cases we share the thread */
1949         cpu->thread = single_tcg_cpu_thread;
1950         cpu->halt_cond = single_tcg_halt_cond;
1951         cpu->thread_id = first_cpu->thread_id;
1952         cpu->can_do_io = 1;
1953         cpu->created = true;
1954     }
1955 }
1956
1957 static void qemu_hax_start_vcpu(CPUState *cpu)
1958 {
1959     char thread_name[VCPU_THREAD_NAME_SIZE];
1960
1961     cpu->thread = g_malloc0(sizeof(QemuThread));
1962     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1963     qemu_cond_init(cpu->halt_cond);
1964
1965     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1966              cpu->cpu_index);
1967     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1968                        cpu, QEMU_THREAD_JOINABLE);
1969 #ifdef _WIN32
1970     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1971 #endif
1972 }
1973
1974 static void qemu_kvm_start_vcpu(CPUState *cpu)
1975 {
1976     char thread_name[VCPU_THREAD_NAME_SIZE];
1977
1978     cpu->thread = g_malloc0(sizeof(QemuThread));
1979     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1980     qemu_cond_init(cpu->halt_cond);
1981     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1982              cpu->cpu_index);
1983     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1984                        cpu, QEMU_THREAD_JOINABLE);
1985 }
1986
1987 static void qemu_hvf_start_vcpu(CPUState *cpu)
1988 {
1989     char thread_name[VCPU_THREAD_NAME_SIZE];
1990
1991     /* HVF currently does not support TCG, and only runs in
1992      * unrestricted-guest mode. */
1993     assert(hvf_enabled());
1994
1995     cpu->thread = g_malloc0(sizeof(QemuThread));
1996     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1997     qemu_cond_init(cpu->halt_cond);
1998
1999     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2000              cpu->cpu_index);
2001     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2002                        cpu, QEMU_THREAD_JOINABLE);
2003 }
2004
2005 static void qemu_whpx_start_vcpu(CPUState *cpu)
2006 {
2007     char thread_name[VCPU_THREAD_NAME_SIZE];
2008
2009     cpu->thread = g_malloc0(sizeof(QemuThread));
2010     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2011     qemu_cond_init(cpu->halt_cond);
2012     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2013              cpu->cpu_index);
2014     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2015                        cpu, QEMU_THREAD_JOINABLE);
2016 #ifdef _WIN32
2017     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2018 #endif
2019 }
2020
2021 static void qemu_dummy_start_vcpu(CPUState *cpu)
2022 {
2023     char thread_name[VCPU_THREAD_NAME_SIZE];
2024
2025     cpu->thread = g_malloc0(sizeof(QemuThread));
2026     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2027     qemu_cond_init(cpu->halt_cond);
2028     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2029              cpu->cpu_index);
2030     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2031                        QEMU_THREAD_JOINABLE);
2032 }
2033
2034 void qemu_init_vcpu(CPUState *cpu)
2035 {
2036     cpu->nr_cores = smp_cores;
2037     cpu->nr_threads = smp_threads;
2038     cpu->stopped = true;
2039
2040     if (!cpu->as) {
2041         /* If the target cpu hasn't set up any address spaces itself,
2042          * give it the default one.
2043          */
2044         cpu->num_ases = 1;
2045         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2046     }
2047
2048     if (kvm_enabled()) {
2049         qemu_kvm_start_vcpu(cpu);
2050     } else if (hax_enabled()) {
2051         qemu_hax_start_vcpu(cpu);
2052     } else if (hvf_enabled()) {
2053         qemu_hvf_start_vcpu(cpu);
2054     } else if (tcg_enabled()) {
2055         qemu_tcg_init_vcpu(cpu);
2056     } else if (whpx_enabled()) {
2057         qemu_whpx_start_vcpu(cpu);
2058     } else {
2059         qemu_dummy_start_vcpu(cpu);
2060     }
2061
2062     while (!cpu->created) {
2063         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2064     }
2065 }
2066
2067 void cpu_stop_current(void)
2068 {
2069     if (current_cpu) {
2070         qemu_cpu_stop(current_cpu, true);
2071     }
2072 }
2073
2074 int vm_stop(RunState state)
2075 {
2076     if (qemu_in_vcpu_thread()) {
2077         qemu_system_vmstop_request_prepare();
2078         qemu_system_vmstop_request(state);
2079         /*
2080          * FIXME: should not return to device code in case
2081          * vm_stop() has been requested.
2082          */
2083         cpu_stop_current();
2084         return 0;
2085     }
2086
2087     return do_vm_stop(state, true);
2088 }
2089
2090 /**
2091  * Prepare for (re)starting the VM.
2092  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2093  * running or in case of an error condition), 0 otherwise.
2094  */
2095 int vm_prepare_start(void)
2096 {
2097     RunState requested;
2098
2099     qemu_vmstop_requested(&requested);
2100     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2101         return -1;
2102     }
2103
2104     /* Ensure that a STOP/RESUME pair of events is emitted if a
2105      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2106      * example, according to documentation is always followed by
2107      * the STOP event.
2108      */
2109     if (runstate_is_running()) {
2110         qapi_event_send_stop(&error_abort);
2111         qapi_event_send_resume(&error_abort);
2112         return -1;
2113     }
2114
2115     /* We are sending this now, but the CPUs will be resumed shortly later */
2116     qapi_event_send_resume(&error_abort);
2117
2118     replay_enable_events();
2119     cpu_enable_ticks();
2120     runstate_set(RUN_STATE_RUNNING);
2121     vm_state_notify(1, RUN_STATE_RUNNING);
2122     return 0;
2123 }
2124
/*
 * Start (or resume) the VM: the vCPUs are resumed only when
 * vm_prepare_start() returns 0.
 */
void vm_start(void)
{
    if (vm_prepare_start() != 0) {
        return;
    }

    resume_all_vcpus();
}
2131
2132 /* does a state transition even if the VM is already stopped,
2133    current state is forgotten forever */
2134 int vm_stop_force_state(RunState state)
2135 {
2136     if (runstate_is_running()) {
2137         return vm_stop(state);
2138     } else {
2139         runstate_set(state);
2140
2141         bdrv_drain_all();
2142         /* Make sure to return an error if the flush in a previous vm_stop()
2143          * failed. */
2144         return bdrv_flush_all();
2145     }
2146 }
2147
2148 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
2149 {
2150     /* XXX: implement xxx_cpu_list for targets that still miss it */
2151 #if defined(cpu_list)
2152     cpu_list(f, cpu_fprintf);
2153 #endif
2154 }
2155
2156 CpuInfoList *qmp_query_cpus(Error **errp)
2157 {
2158     MachineState *ms = MACHINE(qdev_get_machine());
2159     MachineClass *mc = MACHINE_GET_CLASS(ms);
2160     CpuInfoList *head = NULL, *cur_item = NULL;
2161     CPUState *cpu;
2162
2163     CPU_FOREACH(cpu) {
2164         CpuInfoList *info;
2165 #if defined(TARGET_I386)
2166         X86CPU *x86_cpu = X86_CPU(cpu);
2167         CPUX86State *env = &x86_cpu->env;
2168 #elif defined(TARGET_PPC)
2169         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2170         CPUPPCState *env = &ppc_cpu->env;
2171 #elif defined(TARGET_SPARC)
2172         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2173         CPUSPARCState *env = &sparc_cpu->env;
2174 #elif defined(TARGET_RISCV)
2175         RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2176         CPURISCVState *env = &riscv_cpu->env;
2177 #elif defined(TARGET_MIPS)
2178         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2179         CPUMIPSState *env = &mips_cpu->env;
2180 #elif defined(TARGET_TRICORE)
2181         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2182         CPUTriCoreState *env = &tricore_cpu->env;
2183 #elif defined(TARGET_S390X)
2184         S390CPU *s390_cpu = S390_CPU(cpu);
2185         CPUS390XState *env = &s390_cpu->env;
2186 #endif
2187
2188         cpu_synchronize_state(cpu);
2189
2190         info = g_malloc0(sizeof(*info));
2191         info->value = g_malloc0(sizeof(*info->value));
2192         info->value->CPU = cpu->cpu_index;
2193         info->value->current = (cpu == first_cpu);
2194         info->value->halted = cpu->halted;
2195         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2196         info->value->thread_id = cpu->thread_id;
2197 #if defined(TARGET_I386)
2198         info->value->arch = CPU_INFO_ARCH_X86;
2199         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2200 #elif defined(TARGET_PPC)
2201         info->value->arch = CPU_INFO_ARCH_PPC;
2202         info->value->u.ppc.nip = env->nip;
2203 #elif defined(TARGET_SPARC)
2204         info->value->arch = CPU_INFO_ARCH_SPARC;
2205         info->value->u.q_sparc.pc = env->pc;
2206         info->value->u.q_sparc.npc = env->npc;
2207 #elif defined(TARGET_MIPS)
2208         info->value->arch = CPU_INFO_ARCH_MIPS;
2209         info->value->u.q_mips.PC = env->active_tc.PC;
2210 #elif defined(TARGET_TRICORE)
2211         info->value->arch = CPU_INFO_ARCH_TRICORE;
2212         info->value->u.tricore.PC = env->PC;
2213 #elif defined(TARGET_S390X)
2214         info->value->arch = CPU_INFO_ARCH_S390;
2215         info->value->u.s390.cpu_state = env->cpu_state;
2216 #elif defined(TARGET_RISCV)
2217         info->value->arch = CPU_INFO_ARCH_RISCV;
2218         info->value->u.riscv.pc = env->pc;
2219 #else
2220         info->value->arch = CPU_INFO_ARCH_OTHER;
2221 #endif
2222         info->value->has_props = !!mc->cpu_index_to_instance_props;
2223         if (info->value->has_props) {
2224             CpuInstanceProperties *props;
2225             props = g_malloc0(sizeof(*props));
2226             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2227             info->value->props = props;
2228         }
2229
2230         /* XXX: waiting for the qapi to support GSList */
2231         if (!cur_item) {
2232             head = cur_item = info;
2233         } else {
2234             cur_item->next = info;
2235             cur_item = info;
2236         }
2237     }
2238
2239     return head;
2240 }
2241
2242 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2243 {
2244     /*
2245      * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2246      * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2247      */
2248     switch (target) {
2249     case SYS_EMU_TARGET_I386:
2250     case SYS_EMU_TARGET_X86_64:
2251         return CPU_INFO_ARCH_X86;
2252
2253     case SYS_EMU_TARGET_PPC:
2254     case SYS_EMU_TARGET_PPCEMB:
2255     case SYS_EMU_TARGET_PPC64:
2256         return CPU_INFO_ARCH_PPC;
2257
2258     case SYS_EMU_TARGET_SPARC:
2259     case SYS_EMU_TARGET_SPARC64:
2260         return CPU_INFO_ARCH_SPARC;
2261
2262     case SYS_EMU_TARGET_MIPS:
2263     case SYS_EMU_TARGET_MIPSEL:
2264     case SYS_EMU_TARGET_MIPS64:
2265     case SYS_EMU_TARGET_MIPS64EL:
2266         return CPU_INFO_ARCH_MIPS;
2267
2268     case SYS_EMU_TARGET_TRICORE:
2269         return CPU_INFO_ARCH_TRICORE;
2270
2271     case SYS_EMU_TARGET_S390X:
2272         return CPU_INFO_ARCH_S390;
2273
2274     case SYS_EMU_TARGET_RISCV32:
2275     case SYS_EMU_TARGET_RISCV64:
2276         return CPU_INFO_ARCH_RISCV;
2277
2278     default:
2279         return CPU_INFO_ARCH_OTHER;
2280     }
2281 }
2282
2283 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2284 {
2285 #ifdef TARGET_S390X
2286     S390CPU *s390_cpu = S390_CPU(cpu);
2287     CPUS390XState *env = &s390_cpu->env;
2288
2289     info->cpu_state = env->cpu_state;
2290 #else
2291     abort();
2292 #endif
2293 }
2294
2295 /*
2296  * fast means: we NEVER interrupt vCPU threads to retrieve
2297  * information from KVM.
2298  */
2299 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2300 {
2301     MachineState *ms = MACHINE(qdev_get_machine());
2302     MachineClass *mc = MACHINE_GET_CLASS(ms);
2303     CpuInfoFastList *head = NULL, *cur_item = NULL;
2304     SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2305                                           -1, &error_abort);
2306     CPUState *cpu;
2307
2308     CPU_FOREACH(cpu) {
2309         CpuInfoFastList *info = g_malloc0(sizeof(*info));
2310         info->value = g_malloc0(sizeof(*info->value));
2311
2312         info->value->cpu_index = cpu->cpu_index;
2313         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2314         info->value->thread_id = cpu->thread_id;
2315
2316         info->value->has_props = !!mc->cpu_index_to_instance_props;
2317         if (info->value->has_props) {
2318             CpuInstanceProperties *props;
2319             props = g_malloc0(sizeof(*props));
2320             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2321             info->value->props = props;
2322         }
2323
2324         info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2325         info->value->target = target;
2326         if (target == SYS_EMU_TARGET_S390X) {
2327             cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2328         }
2329
2330         if (!cur_item) {
2331             head = cur_item = info;
2332         } else {
2333             cur_item->next = info;
2334             cur_item = info;
2335         }
2336     }
2337
2338     return head;
2339 }
2340
2341 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2342                  bool has_cpu, int64_t cpu_index, Error **errp)
2343 {
2344     FILE *f;
2345     uint32_t l;
2346     CPUState *cpu;
2347     uint8_t buf[1024];
2348     int64_t orig_addr = addr, orig_size = size;
2349
2350     if (!has_cpu) {
2351         cpu_index = 0;
2352     }
2353
2354     cpu = qemu_get_cpu(cpu_index);
2355     if (cpu == NULL) {
2356         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2357                    "a CPU number");
2358         return;
2359     }
2360
2361     f = fopen(filename, "wb");
2362     if (!f) {
2363         error_setg_file_open(errp, errno, filename);
2364         return;
2365     }
2366
2367     while (size != 0) {
2368         l = sizeof(buf);
2369         if (l > size)
2370             l = size;
2371         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2372             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2373                              " specified", orig_addr, orig_size);
2374             goto exit;
2375         }
2376         if (fwrite(buf, 1, l, f) != l) {
2377             error_setg(errp, QERR_IO_ERROR);
2378             goto exit;
2379         }
2380         addr += l;
2381         size -= l;
2382     }
2383
2384 exit:
2385     fclose(f);
2386 }
2387
2388 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2389                   Error **errp)
2390 {
2391     FILE *f;
2392     uint32_t l;
2393     uint8_t buf[1024];
2394
2395     f = fopen(filename, "wb");
2396     if (!f) {
2397         error_setg_file_open(errp, errno, filename);
2398         return;
2399     }
2400
2401     while (size != 0) {
2402         l = sizeof(buf);
2403         if (l > size)
2404             l = size;
2405         cpu_physical_memory_read(addr, buf, l);
2406         if (fwrite(buf, 1, l, f) != l) {
2407             error_setg(errp, QERR_IO_ERROR);
2408             goto exit;
2409         }
2410         addr += l;
2411         size -= l;
2412     }
2413
2414 exit:
2415     fclose(f);
2416 }
2417
2418 void qmp_inject_nmi(Error **errp)
2419 {
2420     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2421 }
2422
2423 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2424 {
2425     if (!use_icount) {
2426         return;
2427     }
2428
2429     cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
2430                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2431     if (icount_align_option) {
2432         cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
2433         cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
2434     } else {
2435         cpu_fprintf(f, "Max guest delay     NA\n");
2436         cpu_fprintf(f, "Max guest advance   NA\n");
2437     }
2438 }