cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu/config-file.h"
  27 #include "cpu.h"
  28 #include "monitor/monitor.h"
  29 #include "qapi/error.h"
  30 #include "qapi/qapi-commands-misc.h"
  31 #include "qapi/qapi-events-run-state.h"
  32 #include "qapi/qmp/qerror.h"
  33 #include "qemu/error-report.h"
  34 #include "qemu/qemu-print.h"
  35 #include "sysemu/sysemu.h"
  36 #include "sysemu/block-backend.h"
  37 #include "exec/gdbstub.h"
  38 #include "sysemu/dma.h"
  39 #include "sysemu/hw_accel.h"
  40 #include "sysemu/kvm.h"
  41 #include "sysemu/hax.h"
  42 #include "sysemu/hvf.h"
  43 #include "sysemu/whpx.h"
  44 #include "exec/exec-all.h"
  45
  46 #include "qemu/thread.h"
  47 #include "sysemu/cpus.h"
  48 #include "sysemu/qtest.h"
  49 #include "qemu/main-loop.h"
  50 #include "qemu/option.h"
  51 #include "qemu/bitmap.h"
  52 #include "qemu/seqlock.h"
  53 #include "qemu/guest-random.h"
  54 #include "tcg.h"
  55 #include "hw/nmi.h"
  56 #include "sysemu/replay.h"
  57 #include "hw/boards.h"
  58
  59 #ifdef CONFIG_LINUX
  60
  61 #include <sys/prctl.h>
  62
  63 #ifndef PR_MCE_KILL
  64 #define PR_MCE_KILL 33
  65 #endif
  66
  67 #ifndef PR_MCE_KILL_SET
  68 #define PR_MCE_KILL_SET 1
  69 #endif
  70
  71 #ifndef PR_MCE_KILL_EARLY
  72 #define PR_MCE_KILL_EARLY 1
  73 #endif
  74
  75 #endif /* CONFIG_LINUX */
  76
/*
 * NOTE(review): max_delay and max_advance are not referenced in the part of
 * the file visible here; presumably they hold icount alignment statistics
 * maintained elsewhere -- confirm against their users.
 */
int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
/* Timer running cpu_throttle_timer_tick(); created in cpu_ticks_init(). */
static QEMUTimer *throttle_timer;
/*
 * Requested throttle percentage; 0 means throttling is off.  Accessed with
 * atomic_set()/atomic_read() by the cpu_throttle_*() functions.
 */
static unsigned int throttle_percentage;

/* Bounds that cpu_throttle_set() clamps the requested percentage to. */
#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/*
 * Base timeslice in nanoseconds (10 ms) from which the throttle sleep time
 * and the throttle timer period are derived.
 */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
  87
  88 bool cpu_is_stopped(CPUState *cpu)
  89 {
  90     return cpu->stopped || !runstate_is_running();
  91 }
  92
  93 static bool cpu_thread_is_idle(CPUState *cpu)
  94 {
  95     if (cpu->stop || cpu->queued_work_first) {
  96         return false;
  97     }
  98     if (cpu_is_stopped(cpu)) {
  99         return true;
 100     }
 101     if (!cpu->halted || cpu_has_work(cpu) ||
 102         kvm_halt_in_kernel()) {
 103         return false;
 104     }
 105     return true;
 106 }
 107
 108 static bool all_cpu_threads_idle(void)
 109 {
 110     CPUState *cpu;
 111
 112     CPU_FOREACH(cpu) {
 113         if (!cpu_thread_is_idle(cpu)) {
 114             return false;
 115         }
 116     }
 117     return true;
 118 }
 119
 120 /***********************************************************/
 121 /* guest cycle counter */
 122
/* Protected by TimersState seqlock */
/*
 * NOTE(review): icount_sleep itself is only written by configure_icount(),
 * without taking the seqlock; the remark above presumably refers to the
 * icount fields of TimersState -- confirm.
 */

/*
 * Value of the icount "sleep" option (default true).  When false no warp
 * timer is created and qemu_start_warp_timer() advances the virtual clock
 * straight to the next deadline instead of waiting in real time.
 */
static bool icount_sleep = true;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
/* Upper bound that icount_adjust() enforces on icount_time_shift. */
#define MAX_ICOUNT_SHIFT 10
 128
/*
 * Timekeeping state shared by the tick, clock and icount helpers in this
 * file.  A single instance, timers_state, is defined below.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    /*
     * NOTE(review): cpu_get_ticks() accesses both fields under
     * vm_clock_lock rather than the BQL -- confirm which rule applies.
     */
    /* Last value returned by cpu_get_ticks_locked(); keeps ticks monotonic. */
    int64_t cpu_ticks_prev;
    /* Offset added to the host tick counter while ticks are enabled. */
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    /* Set to 1 by cpu_enable_ticks() and to 0 by cpu_disable_ticks(). */
    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks.  */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;

    /*
     * QEMU_CLOCK_VIRTUAL_RT time recorded when a warp was started by
     * qemu_start_warp_timer(); icount_warp_rt() treats -1 as "no warp
     * pending" and resets the field to -1 after accounting.
     */
    int64_t vm_clock_warp_start;
    /*
     * Offset added to get_clock() while ticks are enabled; holds the frozen
     * clock value after cpu_disable_ticks().
     */
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    /* Created in configure_icount() for shift=auto only. */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    /* Created in configure_icount() only when the sleep option is on. */
    QEMUTimer *icount_warp_timer;
} TimersState;

static TimersState timers_state;
/* Whether multi-threaded TCG is selected; set by qemu_tcg_configure(). */
bool mttcg_enabled;
 162
 163 /*
 164  * We default to false if we know other options have been enabled
 165  * which are currently incompatible with MTTCG. Otherwise when each
 166  * guest (target) has been updated to support:
 167  *   - atomic instructions
 168  *   - memory ordering primitives (barriers)
 169  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 170  *
 171  * Once a guest architecture has been converted to the new primitives
 172  * there are two remaining limitations to check.
 173  *
 174  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 175  * - The host must have a stronger memory order than the guest
 176  *
 177  * It may be possible in future to support strong guests on weak hosts
 178  * but that will require tagging all load/stores in a guest with their
 179  * implicit memory order requirements which would likely slow things
 180  * down a lot.
 181  */
 182
 183 static bool check_tcg_memory_orders_compatible(void)
 184 {
 185 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 186     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 187 #else
 188     return false;
 189 #endif
 190 }
 191
 192 static bool default_mttcg_enabled(void)
 193 {
 194     if (use_icount || TCG_OVERSIZED_GUEST) {
 195         return false;
 196     } else {
 197 #ifdef TARGET_SUPPORTS_MTTCG
 198         return check_tcg_memory_orders_compatible();
 199 #else
 200         return false;
 201 #endif
 202     }
 203 }
 204
 205 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 206 {
 207     const char *t = qemu_opt_get(opts, "thread");
 208     if (t) {
 209         if (strcmp(t, "multi") == 0) {
 210             if (TCG_OVERSIZED_GUEST) {
 211                 error_setg(errp, "No MTTCG when guest word size > hosts");
 212             } else if (use_icount) {
 213                 error_setg(errp, "No MTTCG when icount is enabled");
 214             } else {
 215 #ifndef TARGET_SUPPORTS_MTTCG
 216                 warn_report("Guest not yet converted to MTTCG - "
 217                             "you may get unexpected results");
 218 #endif
 219                 if (!check_tcg_memory_orders_compatible()) {
 220                     warn_report("Guest expects a stronger memory ordering "
 221                                 "than the host provides");
 222                     error_printf("This may cause strange/hard to debug errors\n");
 223                 }
 224                 mttcg_enabled = true;
 225             }
 226         } else if (strcmp(t, "single") == 0) {
 227             mttcg_enabled = false;
 228         } else {
 229             error_setg(errp, "Invalid 'thread' setting %s", t);
 230         }
 231     } else {
 232         mttcg_enabled = default_mttcg_enabled();
 233     }
 234 }
 235
 236 /* The current number of executed instructions is based on what we
 237  * originally budgeted minus the current state of the decrementing
 238  * icount counters in extra/u16.low.
 239  */
 240 static int64_t cpu_get_icount_executed(CPUState *cpu)
 241 {
 242     return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
 243 }
 244
 245 /*
 246  * Update the global shared timer_state.qemu_icount to take into
 247  * account executed instructions. This is done by the TCG vCPU
 248  * thread so the main-loop can see time has moved forward.
 249  */
 250 static void cpu_update_icount_locked(CPUState *cpu)
 251 {
 252     int64_t executed = cpu_get_icount_executed(cpu);
 253     cpu->icount_budget -= executed;
 254
 255     atomic_set_i64(&timers_state.qemu_icount,
 256                    timers_state.qemu_icount + executed);
 257 }
 258
 259 /*
 260  * Update the global shared timer_state.qemu_icount to take into
 261  * account executed instructions. This is done by the TCG vCPU
 262  * thread so the main-loop can see time has moved forward.
 263  */
 264 void cpu_update_icount(CPUState *cpu)
 265 {
 266     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 267                        &timers_state.vm_clock_lock);
 268     cpu_update_icount_locked(cpu);
 269     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 270                          &timers_state.vm_clock_lock);
 271 }
 272
 273 static int64_t cpu_get_icount_raw_locked(void)
 274 {
 275     CPUState *cpu = current_cpu;
 276
 277     if (cpu && cpu->running) {
 278         if (!cpu->can_do_io) {
 279             error_report("Bad icount read");
 280             exit(1);
 281         }
 282         /* Take into account what has run */
 283         cpu_update_icount_locked(cpu);
 284     }
 285     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 286     return atomic_read_i64(&timers_state.qemu_icount);
 287 }
 288
 289 static int64_t cpu_get_icount_locked(void)
 290 {
 291     int64_t icount = cpu_get_icount_raw_locked();
 292     return atomic_read_i64(&timers_state.qemu_icount_bias) +
 293         cpu_icount_to_ns(icount);
 294 }
 295
 296 int64_t cpu_get_icount_raw(void)
 297 {
 298     int64_t icount;
 299     unsigned start;
 300
 301     do {
 302         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 303         icount = cpu_get_icount_raw_locked();
 304     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 305
 306     return icount;
 307 }
 308
 309 /* Return the virtual CPU time, based on the instruction counter.  */
 310 int64_t cpu_get_icount(void)
 311 {
 312     int64_t icount;
 313     unsigned start;
 314
 315     do {
 316         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 317         icount = cpu_get_icount_locked();
 318     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 319
 320     return icount;
 321 }
 322
 323 int64_t cpu_icount_to_ns(int64_t icount)
 324 {
 325     return icount << atomic_read(&timers_state.icount_time_shift);
 326 }
 327
 328 static int64_t cpu_get_ticks_locked(void)
 329 {
 330     int64_t ticks = timers_state.cpu_ticks_offset;
 331     if (timers_state.cpu_ticks_enabled) {
 332         ticks += cpu_get_host_ticks();
 333     }
 334
 335     if (timers_state.cpu_ticks_prev > ticks) {
 336         /* Non increasing ticks may happen if the host uses software suspend.  */
 337         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 338         ticks = timers_state.cpu_ticks_prev;
 339     }
 340
 341     timers_state.cpu_ticks_prev = ticks;
 342     return ticks;
 343 }
 344
 345 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 346  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 347  * counter.
 348  */
 349 int64_t cpu_get_ticks(void)
 350 {
 351     int64_t ticks;
 352
 353     if (use_icount) {
 354         return cpu_get_icount();
 355     }
 356
 357     qemu_spin_lock(&timers_state.vm_clock_lock);
 358     ticks = cpu_get_ticks_locked();
 359     qemu_spin_unlock(&timers_state.vm_clock_lock);
 360     return ticks;
 361 }
 362
 363 static int64_t cpu_get_clock_locked(void)
 364 {
 365     int64_t time;
 366
 367     time = timers_state.cpu_clock_offset;
 368     if (timers_state.cpu_ticks_enabled) {
 369         time += get_clock();
 370     }
 371
 372     return time;
 373 }
 374
 375 /* Return the monotonic time elapsed in VM, i.e.,
 376  * the time between vm_start and vm_stop
 377  */
 378 int64_t cpu_get_clock(void)
 379 {
 380     int64_t ti;
 381     unsigned start;
 382
 383     do {
 384         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 385         ti = cpu_get_clock_locked();
 386     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 387
 388     return ti;
 389 }
 390
 391 /* enable cpu_get_ticks()
 392  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 393  */
 394 void cpu_enable_ticks(void)
 395 {
 396     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 397                        &timers_state.vm_clock_lock);
 398     if (!timers_state.cpu_ticks_enabled) {
 399         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 400         timers_state.cpu_clock_offset -= get_clock();
 401         timers_state.cpu_ticks_enabled = 1;
 402     }
 403     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 404                        &timers_state.vm_clock_lock);
 405 }
 406
 407 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 408  * cpu_get_ticks() after that.
 409  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 410  */
 411 void cpu_disable_ticks(void)
 412 {
 413     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 414                        &timers_state.vm_clock_lock);
 415     if (timers_state.cpu_ticks_enabled) {
 416         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 417         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 418         timers_state.cpu_ticks_enabled = 0;
 419     }
 420     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 421                          &timers_state.vm_clock_lock);
 422 }
 423
 424 /* Correlation between real and virtual time is always going to be
 425    fairly approximate, so ignore small variation.
 426    When the guest is idle real and virtual time will be aligned in
 427    the IO wait loop.  */
 428 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 429
 430 static void icount_adjust(void)
 431 {
 432     int64_t cur_time;
 433     int64_t cur_icount;
 434     int64_t delta;
 435
 436     /* Protected by TimersState mutex.  */
 437     static int64_t last_delta;
 438
 439     /* If the VM is not running, then do nothing.  */
 440     if (!runstate_is_running()) {
 441         return;
 442     }
 443
 444     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 445                        &timers_state.vm_clock_lock);
 446     cur_time = cpu_get_clock_locked();
 447     cur_icount = cpu_get_icount_locked();
 448
 449     delta = cur_icount - cur_time;
 450     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 451     if (delta > 0
 452         && last_delta + ICOUNT_WOBBLE < delta * 2
 453         && timers_state.icount_time_shift > 0) {
 454         /* The guest is getting too far ahead.  Slow time down.  */
 455         atomic_set(&timers_state.icount_time_shift,
 456                    timers_state.icount_time_shift - 1);
 457     }
 458     if (delta < 0
 459         && last_delta - ICOUNT_WOBBLE > delta * 2
 460         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 461         /* The guest is getting too far behind.  Speed time up.  */
 462         atomic_set(&timers_state.icount_time_shift,
 463                    timers_state.icount_time_shift + 1);
 464     }
 465     last_delta = delta;
 466     atomic_set_i64(&timers_state.qemu_icount_bias,
 467                    cur_icount - (timers_state.qemu_icount
 468                                  << timers_state.icount_time_shift));
 469     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 470                          &timers_state.vm_clock_lock);
 471 }
 472
 473 static void icount_adjust_rt(void *opaque)
 474 {
 475     timer_mod(timers_state.icount_rt_timer,
 476               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 477     icount_adjust();
 478 }
 479
 480 static void icount_adjust_vm(void *opaque)
 481 {
 482     timer_mod(timers_state.icount_vm_timer,
 483                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 484                    NANOSECONDS_PER_SECOND / 10);
 485     icount_adjust();
 486 }
 487
 488 static int64_t qemu_icount_round(int64_t count)
 489 {
 490     int shift = atomic_read(&timers_state.icount_time_shift);
 491     return (count + (1 << shift) - 1) >> shift;
 492 }
 493
 494 static void icount_warp_rt(void)
 495 {
 496     unsigned seq;
 497     int64_t warp_start;
 498
 499     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 500      * changes from -1 to another value, so the race here is okay.
 501      */
 502     do {
 503         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 504         warp_start = timers_state.vm_clock_warp_start;
 505     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 506
 507     if (warp_start == -1) {
 508         return;
 509     }
 510
 511     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 512                        &timers_state.vm_clock_lock);
 513     if (runstate_is_running()) {
 514         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 515                                             cpu_get_clock_locked());
 516         int64_t warp_delta;
 517
 518         warp_delta = clock - timers_state.vm_clock_warp_start;
 519         if (use_icount == 2) {
 520             /*
 521              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 522              * far ahead of real time.
 523              */
 524             int64_t cur_icount = cpu_get_icount_locked();
 525             int64_t delta = clock - cur_icount;
 526             warp_delta = MIN(warp_delta, delta);
 527         }
 528         atomic_set_i64(&timers_state.qemu_icount_bias,
 529                        timers_state.qemu_icount_bias + warp_delta);
 530     }
 531     timers_state.vm_clock_warp_start = -1;
 532     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 533                        &timers_state.vm_clock_lock);
 534
 535     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 536         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 537     }
 538 }
 539
 540 static void icount_timer_cb(void *opaque)
 541 {
 542     /* No need for a checkpoint because the timer already synchronizes
 543      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 544      */
 545     icount_warp_rt();
 546 }
 547
 548 void qtest_clock_warp(int64_t dest)
 549 {
 550     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 551     AioContext *aio_context;
 552     assert(qtest_enabled());
 553     aio_context = qemu_get_aio_context();
 554     while (clock < dest) {
 555         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 556         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 557
 558         seqlock_write_lock(&timers_state.vm_clock_seqlock,
 559                            &timers_state.vm_clock_lock);
 560         atomic_set_i64(&timers_state.qemu_icount_bias,
 561                        timers_state.qemu_icount_bias + warp);
 562         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 563                              &timers_state.vm_clock_lock);
 564
 565         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 566         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 567         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 568     }
 569     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 570 }
 571
 572 void qemu_start_warp_timer(void)
 573 {
 574     int64_t clock;
 575     int64_t deadline;
 576
 577     if (!use_icount) {
 578         return;
 579     }
 580
 581     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 582      * do not fire, so computing the deadline does not make sense.
 583      */
 584     if (!runstate_is_running()) {
 585         return;
 586     }
 587
 588     if (replay_mode != REPLAY_MODE_PLAY) {
 589         if (!all_cpu_threads_idle()) {
 590             return;
 591         }
 592
 593         if (qtest_enabled()) {
 594             /* When testing, qtest commands advance icount.  */
 595             return;
 596         }
 597
 598         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
 599     } else {
 600         /* warp clock deterministically in record/replay mode */
 601         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 602             /* vCPU is sleeping and warp can't be started.
 603                It is probably a race condition: notification sent
 604                to vCPU was processed in advance and vCPU went to sleep.
 605                Therefore we have to wake it up for doing someting. */
 606             if (replay_has_checkpoint()) {
 607                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 608             }
 609             return;
 610         }
 611     }
 612
 613     /* We want to use the earliest deadline from ALL vm_clocks */
 614     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 615     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 616     if (deadline < 0) {
 617         static bool notified;
 618         if (!icount_sleep && !notified) {
 619             warn_report("icount sleep disabled and no active timers");
 620             notified = true;
 621         }
 622         return;
 623     }
 624
 625     if (deadline > 0) {
 626         /*
 627          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 628          * sleep.  Otherwise, the CPU might be waiting for a future timer
 629          * interrupt to wake it up, but the interrupt never comes because
 630          * the vCPU isn't running any insns and thus doesn't advance the
 631          * QEMU_CLOCK_VIRTUAL.
 632          */
 633         if (!icount_sleep) {
 634             /*
 635              * We never let VCPUs sleep in no sleep icount mode.
 636              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 637              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 638              * It is useful when we want a deterministic execution time,
 639              * isolated from host latencies.
 640              */
 641             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 642                                &timers_state.vm_clock_lock);
 643             atomic_set_i64(&timers_state.qemu_icount_bias,
 644                            timers_state.qemu_icount_bias + deadline);
 645             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 646                                  &timers_state.vm_clock_lock);
 647             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 648         } else {
 649             /*
 650              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 651              * "real" time, (related to the time left until the next event) has
 652              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 653              * This avoids that the warps are visible externally; for example,
 654              * you will not be sending network packets continuously instead of
 655              * every 100ms.
 656              */
 657             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 658                                &timers_state.vm_clock_lock);
 659             if (timers_state.vm_clock_warp_start == -1
 660                 || timers_state.vm_clock_warp_start > clock) {
 661                 timers_state.vm_clock_warp_start = clock;
 662             }
 663             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 664                                  &timers_state.vm_clock_lock);
 665             timer_mod_anticipate(timers_state.icount_warp_timer,
 666                                  clock + deadline);
 667         }
 668     } else if (deadline == 0) {
 669         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 670     }
 671 }
 672
 673 static void qemu_account_warp_timer(void)
 674 {
 675     if (!use_icount || !icount_sleep) {
 676         return;
 677     }
 678
 679     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 680      * do not fire, so computing the deadline does not make sense.
 681      */
 682     if (!runstate_is_running()) {
 683         return;
 684     }
 685
 686     /* warp clock deterministically in record/replay mode */
 687     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 688         return;
 689     }
 690
 691     timer_del(timers_state.icount_warp_timer);
 692     icount_warp_rt();
 693 }
 694
 695 static bool icount_state_needed(void *opaque)
 696 {
 697     return use_icount;
 698 }
 699
 700 static bool warp_timer_state_needed(void *opaque)
 701 {
 702     TimersState *s = opaque;
 703     return s->icount_warp_timer != NULL;
 704 }
 705
 706 static bool adjust_timers_state_needed(void *opaque)
 707 {
 708     TimersState *s = opaque;
 709     return s->icount_rt_timer != NULL;
 710 }
 711
/*
 * Migration subsection for the warp timer.  It is optional because the
 * timer may not have been created; warp_timer_state_needed() decides
 * whether it is sent.  Carries the warp start time and the timer itself.
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 726
/*
 * Migration subsection for the two icount adjustment timers.  Sent only
 * when adjust_timers_state_needed() reports that icount_rt_timer exists.
 */
static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 738
/*
 * This is a subsection for icount migration.  It is sent only when
 * icount_state_needed() is true and carries the icount bias and the
 * instruction counter, with the warp timer and adjustment timers as nested
 * optional subsections.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        NULL
    }
};
 758
/*
 * Top-level migration description for timers_state, registered by
 * cpu_ticks_init().  cpu_clock_offset is only present from version 2 on;
 * the icount state travels in an optional subsection.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        /* NOTE(review): 8 bytes skipped, presumably a retired field. */
        VMSTATE_UNUSED(8),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
 774
 775 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 776 {
 777     double pct;
 778     double throttle_ratio;
 779     long sleeptime_ns;
 780
 781     if (!cpu_throttle_get_percentage()) {
 782         return;
 783     }
 784
 785     pct = (double)cpu_throttle_get_percentage()/100;
 786     throttle_ratio = pct / (1 - pct);
 787     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
 788
 789     qemu_mutex_unlock_iothread();
 790     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
 791     qemu_mutex_lock_iothread();
 792     atomic_set(&cpu->throttle_thread_scheduled, 0);
 793 }
 794
 795 static void cpu_throttle_timer_tick(void *opaque)
 796 {
 797     CPUState *cpu;
 798     double pct;
 799
 800     /* Stop the timer if needed */
 801     if (!cpu_throttle_get_percentage()) {
 802         return;
 803     }
 804     CPU_FOREACH(cpu) {
 805         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 806             async_run_on_cpu(cpu, cpu_throttle_thread,
 807                              RUN_ON_CPU_NULL);
 808         }
 809     }
 810
 811     pct = (double)cpu_throttle_get_percentage()/100;
 812     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 813                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 814 }
 815
 816 void cpu_throttle_set(int new_throttle_pct)
 817 {
 818     /* Ensure throttle percentage is within valid range */
 819     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 820     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 821
 822     atomic_set(&throttle_percentage, new_throttle_pct);
 823
 824     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 825                                        CPU_THROTTLE_TIMESLICE_NS);
 826 }
 827
 828 void cpu_throttle_stop(void)
 829 {
 830     atomic_set(&throttle_percentage, 0);
 831 }
 832
 833 bool cpu_throttle_active(void)
 834 {
 835     return (cpu_throttle_get_percentage() != 0);
 836 }
 837
 838 int cpu_throttle_get_percentage(void)
 839 {
 840     return atomic_read(&throttle_percentage);
 841 }
 842
 843 void cpu_ticks_init(void)
 844 {
 845     seqlock_init(&timers_state.vm_clock_seqlock);
 846     qemu_spin_init(&timers_state.vm_clock_lock);
 847     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 848     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 849                                            cpu_throttle_timer_tick, NULL);
 850 }
 851
 852 void configure_icount(QemuOpts *opts, Error **errp)
 853 {
 854     const char *option;
 855     char *rem_str = NULL;
 856
 857     option = qemu_opt_get(opts, "shift");
 858     if (!option) {
 859         if (qemu_opt_get(opts, "align") != NULL) {
 860             error_setg(errp, "Please specify shift option when using align");
 861         }
 862         return;
 863     }
 864
 865     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 866     if (icount_sleep) {
 867         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 868                                          icount_timer_cb, NULL);
 869     }
 870
 871     icount_align_option = qemu_opt_get_bool(opts, "align", false);
 872
 873     if (icount_align_option && !icount_sleep) {
 874         error_setg(errp, "align=on and sleep=off are incompatible");
 875     }
 876     if (strcmp(option, "auto") != 0) {
 877         errno = 0;
 878         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
 879         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 880             error_setg(errp, "icount: Invalid shift value");
 881         }
 882         use_icount = 1;
 883         return;
 884     } else if (icount_align_option) {
 885         error_setg(errp, "shift=auto and align=on are incompatible");
 886     } else if (!icount_sleep) {
 887         error_setg(errp, "shift=auto and sleep=off are incompatible");
 888     }
 889
 890     use_icount = 2;
 891
 892     /* 125MIPS seems a reasonable initial guess at the guest speed.
 893        It will be corrected fairly quickly anyway.  */
 894     timers_state.icount_time_shift = 3;
 895
 896     /* Have both realtime and virtual time triggers for speed adjustment.
 897        The realtime trigger catches emulated time passing too slowly,
 898        the virtual time trigger catches emulated time passing too fast.
 899        Realtime triggers occur even when idle, so use them less frequently
 900        than VM triggers.  */
 901     timers_state.vm_clock_warp_start = -1;
 902     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 903                                    icount_adjust_rt, NULL);
 904     timer_mod(timers_state.icount_rt_timer,
 905                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 906     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 907                                         icount_adjust_vm, NULL);
 908     timer_mod(timers_state.icount_vm_timer,
 909                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 910                    NANOSECONDS_PER_SECOND / 10);
 911 }
 912
 913 /***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */
 924
/*
 * Timer used to kick the round-robin TCG thread.  It is created lazily by
 * start_tcg_kick_timer() and re-armed from its own callback,
 * kick_tcg_thread().
 */
static QEMUTimer *tcg_kick_vcpu_timer;
/*
 * vCPU currently selected by the round-robin loop, or NULL when there is
 * none.  Written by qemu_tcg_rr_cpu_thread_fn() and read by
 * qemu_cpu_kick_rr_cpu() through the atomic accessors.
 */
static CPUState *tcg_current_rr_cpu;

/* Interval between kicks: one tenth of NANOSECONDS_PER_SECOND. */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 929
 930 static inline int64_t qemu_tcg_next_kick(void)
 931 {
 932     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 933 }
 934
 935 /* Kick the currently round-robin scheduled vCPU */
 936 static void qemu_cpu_kick_rr_cpu(void)
 937 {
 938     CPUState *cpu;
 939     do {
 940         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 941         if (cpu) {
 942             cpu_exit(cpu);
 943         }
 944     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 945 }
 946
/*
 * Intentionally empty work item.  qemu_timer_notify_cb() queues it with
 * async_run_on_cpu() only so that the target CPU has pending work; both
 * arguments are ignored.
 */
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}
 950
 951 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 952 {
 953     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 954         qemu_notify_event();
 955         return;
 956     }
 957
 958     if (qemu_in_vcpu_thread()) {
 959         /* A CPU is currently running; kick it back out to the
 960          * tcg_cpu_exec() loop so it will recalculate its
 961          * icount deadline immediately.
 962          */
 963         qemu_cpu_kick(current_cpu);
 964     } else if (first_cpu) {
 965         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 966          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 967          * causes cpu_thread_is_idle to return false.  This way,
 968          * handle_icount_deadline can run.
 969          * If we have no CPUs at all for some reason, we don't
 970          * need to do anything.
 971          */
 972         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 973     }
 974 }
 975
 976 static void kick_tcg_thread(void *opaque)
 977 {
 978     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 979     qemu_cpu_kick_rr_cpu();
 980 }
 981
 982 static void start_tcg_kick_timer(void)
 983 {
 984     assert(!mttcg_enabled);
 985     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 986         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 987                                            kick_tcg_thread, NULL);
 988     }
 989     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
 990         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 991     }
 992 }
 993
 994 static void stop_tcg_kick_timer(void)
 995 {
 996     assert(!mttcg_enabled);
 997     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
 998         timer_del(tcg_kick_vcpu_timer);
 999     }
1000 }
1001
1002 /***********************************************************/
1003 void hw_error(const char *fmt, ...)
1004 {
1005     va_list ap;
1006     CPUState *cpu;
1007
1008     va_start(ap, fmt);
1009     fprintf(stderr, "qemu: hardware error: ");
1010     vfprintf(stderr, fmt, ap);
1011     fprintf(stderr, "\n");
1012     CPU_FOREACH(cpu) {
1013         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1014         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1015     }
1016     va_end(ap);
1017     abort();
1018 }
1019
1020 void cpu_synchronize_all_states(void)
1021 {
1022     CPUState *cpu;
1023
1024     CPU_FOREACH(cpu) {
1025         cpu_synchronize_state(cpu);
1026         /* TODO: move to cpu_synchronize_state() */
1027         if (hvf_enabled()) {
1028             hvf_cpu_synchronize_state(cpu);
1029         }
1030     }
1031 }
1032
1033 void cpu_synchronize_all_post_reset(void)
1034 {
1035     CPUState *cpu;
1036
1037     CPU_FOREACH(cpu) {
1038         cpu_synchronize_post_reset(cpu);
1039         /* TODO: move to cpu_synchronize_post_reset() */
1040         if (hvf_enabled()) {
1041             hvf_cpu_synchronize_post_reset(cpu);
1042         }
1043     }
1044 }
1045
1046 void cpu_synchronize_all_post_init(void)
1047 {
1048     CPUState *cpu;
1049
1050     CPU_FOREACH(cpu) {
1051         cpu_synchronize_post_init(cpu);
1052         /* TODO: move to cpu_synchronize_post_init() */
1053         if (hvf_enabled()) {
1054             hvf_cpu_synchronize_post_init(cpu);
1055         }
1056     }
1057 }
1058
1059 void cpu_synchronize_all_pre_loadvm(void)
1060 {
1061     CPUState *cpu;
1062
1063     CPU_FOREACH(cpu) {
1064         cpu_synchronize_pre_loadvm(cpu);
1065     }
1066 }
1067
1068 static int do_vm_stop(RunState state, bool send_stop)
1069 {
1070     int ret = 0;
1071
1072     if (runstate_is_running()) {
1073         cpu_disable_ticks();
1074         pause_all_vcpus();
1075         runstate_set(state);
1076         vm_state_notify(0, state);
1077         if (send_stop) {
1078             qapi_event_send_stop();
1079         }
1080     }
1081
1082     bdrv_drain_all();
1083     replay_disable_events();
1084     ret = bdrv_flush_all();
1085
1086     return ret;
1087 }
1088
/* Special vm_stop() variant for terminating the process.  Historically clients
 * did not expect a QMP STOP event and so we need to retain compatibility.
 *
 * Switches to RUN_STATE_SHUTDOWN without sending the STOP event and
 * returns the result of do_vm_stop().
 */
int vm_shutdown(void)
{
    return do_vm_stop(RUN_STATE_SHUTDOWN, false);
}
1096
1097 static bool cpu_can_run(CPUState *cpu)
1098 {
1099     if (cpu->stop) {
1100         return false;
1101     }
1102     if (cpu_is_stopped(cpu)) {
1103         return false;
1104     }
1105     return true;
1106 }
1107
/*
 * Called by the vCPU thread loops when execution returns EXCP_DEBUG.
 * Passes @cpu to gdb_set_stop_cpu(), issues a system debug request and
 * marks the vCPU as stopped.
 */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1114
1115 #ifdef CONFIG_LINUX
1116 static void sigbus_reraise(void)
1117 {
1118     sigset_t set;
1119     struct sigaction action;
1120
1121     memset(&action, 0, sizeof(action));
1122     action.sa_handler = SIG_DFL;
1123     if (!sigaction(SIGBUS, &action, NULL)) {
1124         raise(SIGBUS);
1125         sigemptyset(&set);
1126         sigaddset(&set, SIGBUS);
1127         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1128     }
1129     perror("Failed to re-raise SIGBUS!\n");
1130     abort();
1131 }
1132
1133 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1134 {
1135     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1136         sigbus_reraise();
1137     }
1138
1139     if (current_cpu) {
1140         /* Called asynchronously in VCPU thread.  */
1141         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1142             sigbus_reraise();
1143         }
1144     } else {
1145         /* Called synchronously (via signalfd) in main thread.  */
1146         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1147             sigbus_reraise();
1148         }
1149     }
1150 }
1151
1152 static void qemu_init_sigbus(void)
1153 {
1154     struct sigaction action;
1155
1156     memset(&action, 0, sizeof(action));
1157     action.sa_flags = SA_SIGINFO;
1158     action.sa_sigaction = sigbus_handler;
1159     sigaction(SIGBUS, &action, NULL);
1160
1161     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1162 }
1163 #else /* !CONFIG_LINUX */
/* No SIGBUS handling is set up when CONFIG_LINUX is not defined. */
static void qemu_init_sigbus(void)
{
}
1167 #endif /* !CONFIG_LINUX */
1168
/*
 * The global ("iothread") mutex taken by qemu_mutex_lock_iothread_impl()
 * and released by qemu_mutex_unlock_iothread().  It is also the mutex
 * paired with the condition variables below and with cpu->halt_cond.
 */
static QemuMutex qemu_global_mutex;

/* Thread that called qemu_init_cpu_loop(). */
static QemuThread io_thread;

/* cpu creation: signalled by vCPU threads when cpu->created changes */
static QemuCond qemu_cpu_cond;
/* vCPU pause: broadcast by qemu_cpu_stop(), waited on by pause_all_vcpus() */
static QemuCond qemu_pause_cond;
1177
1178 void qemu_init_cpu_loop(void)
1179 {
1180     qemu_init_sigbus();
1181     qemu_cond_init(&qemu_cpu_cond);
1182     qemu_cond_init(&qemu_pause_cond);
1183     qemu_mutex_init(&qemu_global_mutex);
1184
1185     qemu_thread_get_self(&io_thread);
1186 }
1187
/*
 * Run @func with @data on @cpu by delegating to do_run_on_cpu(), passing
 * qemu_global_mutex as the mutex argument.
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1192
1193 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1194 {
1195     if (kvm_destroy_vcpu(cpu) < 0) {
1196         error_report("kvm_destroy_vcpu failed");
1197         exit(EXIT_FAILURE);
1198     }
1199 }
1200
/*
 * TCG counterpart of the per-accelerator vCPU destroy hooks.  Currently
 * empty: nothing is torn down for @cpu here.
 */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1204
/*
 * Mark @cpu as stopped and wake anyone waiting on qemu_pause_cond.
 *
 * @cpu:  the vCPU to stop; must belong to the calling thread (asserted).
 * @exit: when true, also call cpu_exit() on @cpu.
 *
 * The pending stop request (cpu->stop) is cleared as it has now been
 * honoured.
 */
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}
1215
/*
 * Housekeeping shared by all the wait-for-event paths: reset the
 * thread_kicked flag, honour a pending stop request for @cpu, and run
 * any work queued for it.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
1224
/*
 * Wait-for-event step of the round-robin TCG thread.
 *
 * While every vCPU thread is idle, the kick timer is stopped and the
 * thread sleeps on first_cpu->halt_cond (with qemu_global_mutex).  Once
 * woken, the kick timer is restarted and the common post-wait
 * processing is run for every CPU.
 */
static void qemu_tcg_rr_wait_io_event(void)
{
    CPUState *cpu;

    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}
1240
/*
 * Block the calling vCPU thread on cpu->halt_cond (with
 * qemu_global_mutex) for as long as @cpu is idle, then run the common
 * post-wait processing for it.
 */
static void qemu_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

#ifdef _WIN32
    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
    if (!tcg_enabled()) {
        SleepEx(0, TRUE);
    }
#endif
    qemu_wait_io_event_common(cpu);
}
1255
/*
 * Thread entry point for a KVM vCPU; @arg is the CPUState to run.
 *
 * Initializes the vCPU under the iothread lock, announces its creation
 * on qemu_cpu_cond, then alternates between kvm_cpu_exec() and
 * qemu_wait_io_event() until the CPU has been flagged for unplug and
 * can no longer run.  Always returns NULL.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        /* A negative result is treated as a negated errno value. */
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Tear down and announce removal before dropping the iothread lock. */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1299
1300 static void *qemu_dummy_cpu_thread_fn(void *arg)
1301 {
1302 #ifdef _WIN32
1303     error_report("qtest is not supported under Windows");
1304     exit(1);
1305 #else
1306     CPUState *cpu = arg;
1307     sigset_t waitset;
1308     int r;
1309
1310     rcu_register_thread();
1311
1312     qemu_mutex_lock_iothread();
1313     qemu_thread_get_self(cpu->thread);
1314     cpu->thread_id = qemu_get_thread_id();
1315     cpu->can_do_io = 1;
1316     current_cpu = cpu;
1317
1318     sigemptyset(&waitset);
1319     sigaddset(&waitset, SIG_IPI);
1320
1321     /* signal CPU creation */
1322     cpu->created = true;
1323     qemu_cond_signal(&qemu_cpu_cond);
1324     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1325
1326     do {
1327         qemu_mutex_unlock_iothread();
1328         do {
1329             int sig;
1330             r = sigwait(&waitset, &sig);
1331         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1332         if (r == -1) {
1333             perror("sigwait");
1334             exit(1);
1335         }
1336         qemu_mutex_lock_iothread();
1337         qemu_wait_io_event(cpu);
1338     } while (!cpu->unplug);
1339
1340     qemu_mutex_unlock_iothread();
1341     rcu_unregister_thread();
1342     return NULL;
1343 #endif
1344 }
1345
1346 static int64_t tcg_get_icount_limit(void)
1347 {
1348     int64_t deadline;
1349
1350     if (replay_mode != REPLAY_MODE_PLAY) {
1351         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1352
1353         /* Maintain prior (possibly buggy) behaviour where if no deadline
1354          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1355          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1356          * nanoseconds.
1357          */
1358         if ((deadline < 0) || (deadline > INT32_MAX)) {
1359             deadline = INT32_MAX;
1360         }
1361
1362         return qemu_icount_round(deadline);
1363     } else {
1364         return replay_get_instructions();
1365     }
1366 }
1367
1368 static void handle_icount_deadline(void)
1369 {
1370     assert(qemu_in_vcpu_thread());
1371     if (use_icount) {
1372         int64_t deadline =
1373             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1374
1375         if (deadline == 0) {
1376             /* Wake up other AioContexts.  */
1377             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1378             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1379         }
1380     }
1381 }
1382
1383 static void prepare_icount_for_run(CPUState *cpu)
1384 {
1385     if (use_icount) {
1386         int insns_left;
1387
1388         /* These should always be cleared by process_icount_data after
1389          * each vCPU execution. However u16.high can be raised
1390          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1391          */
1392         g_assert(cpu->icount_decr.u16.low == 0);
1393         g_assert(cpu->icount_extra == 0);
1394
1395         cpu->icount_budget = tcg_get_icount_limit();
1396         insns_left = MIN(0xffff, cpu->icount_budget);
1397         cpu->icount_decr.u16.low = insns_left;
1398         cpu->icount_extra = cpu->icount_budget - insns_left;
1399
1400         replay_mutex_lock();
1401     }
1402 }
1403
1404 static void process_icount_data(CPUState *cpu)
1405 {
1406     if (use_icount) {
1407         /* Account for executed instructions */
1408         cpu_update_icount(cpu);
1409
1410         /* Reset the counters */
1411         cpu->icount_decr.u16.low = 0;
1412         cpu->icount_extra = 0;
1413         cpu->icount_budget = 0;
1414
1415         replay_account_executed_instructions();
1416
1417         replay_mutex_unlock();
1418     }
1419 }
1420
1421
/*
 * Execute guest code on @cpu with TCG, bracketed by cpu_exec_start() and
 * cpu_exec_end().  Returns the value of cpu_exec().
 *
 * With CONFIG_PROFILER the time spent in here is added to
 * tcg_ctx->prof.cpu_exec_time.
 */
static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

    assert(tcg_enabled());
#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    atomic_set(&tcg_ctx->prof.cpu_exec_time,
               tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
#endif
    return ret;
}
1442
1443 /* Destroy any remaining vCPUs which have been unplugged and have
1444  * finished running
1445  */
1446 static void deal_with_unplugged_cpus(void)
1447 {
1448     CPUState *cpu;
1449
1450     CPU_FOREACH(cpu) {
1451         if (cpu->unplug && !cpu_can_run(cpu)) {
1452             qemu_tcg_destroy_vcpu(cpu);
1453             cpu->created = false;
1454             qemu_cond_signal(&qemu_cpu_cond);
1455             break;
1456         }
1457     }
1458 }
1459
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 */

/*
 * Thread entry point for round-robin TCG; @arg is the CPUState the
 * thread was created for.  The thread then services every CPU in the
 * CPU list, so the local 'cpu' is reused as the iteration cursor.
 *
 * The outer loop never breaks, so the statements following it are not
 * reached.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    /* 'cpu' was clobbered by the CPU_FOREACH above; start from the head. */
    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /* The iothread lock is dropped before taking the replay mutex. */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* The previous pass ran off the end of the list: wrap around. */
        if (!cpu) {
            cpu = first_cpu;
        }

        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            /* The virtual clock is disabled while SSTEP_NOTIMER is set. */
            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code runs without the iothread lock held. */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* Step past a CPU that is being unplugged before leaving. */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    rcu_unregister_thread();
    return NULL;
}
1582
/*
 * Thread entry point for a HAX vCPU; @arg is the CPUState to run.
 *
 * Initializes the vCPU under the iothread lock, announces its creation
 * on qemu_cpu_cond, then alternates between hax_smp_cpu_exec() and
 * qemu_wait_io_event() until the CPU has been flagged for unplug and
 * can no longer run.  Always returns NULL.
 *
 * NOTE(review): unlike the KVM/HVF/WHPX/TCG thread functions in this
 * file, the exit path here neither clears cpu->created, signals
 * qemu_cpu_cond, nor releases the iothread lock taken at the top --
 * confirm whether this path is ever expected to be reached.
 */
static void *qemu_hax_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();
    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->halted = 0;
    current_cpu = cpu;

    hax_init_vcpu(cpu);
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = hax_smp_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }

        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));
    rcu_unregister_thread();
    return NULL;
}
1614
/* The HVF-specific vCPU thread function. This one should only run when the host
 * CPU supports the VMX "unrestricted guest" feature.
 *
 * @arg is the CPUState to run.  After initializing the vCPU and
 * announcing its creation on qemu_cpu_cond, the thread alternates
 * between hvf_vcpu_exec() and qemu_wait_io_event() until the CPU has
 * been flagged for unplug and can no longer run.  Always returns NULL.
 */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Tear down and announce removal before dropping the iothread lock. */
    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1658
/*
 * Thread entry point for a WHPX vCPU; @arg is the CPUState to run.
 *
 * Initializes the vCPU under the iothread lock (exiting the process if
 * that fails), announces its creation on qemu_cpu_cond, then alternates
 * between whpx_vcpu_exec() and waiting for events until the CPU has
 * been flagged for unplug and can no longer run.  Always returns NULL.
 */
static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        /* A negative result is treated as a negated errno value. */
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        /*
         * Open-coded idle wait followed by the common processing, rather
         * than a call to qemu_wait_io_event().
         */
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Tear down and announce removal before dropping the iothread lock. */
    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1702
1703 #ifdef _WIN32
/*
 * Empty APC routine.  qemu_cpu_kick_thread() queues it on a vCPU thread
 * with QueueUserAPC(); the routine itself does nothing.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1707 #endif
1708
/* Multi-threaded TCG
 *
 * In the multi-threaded case each vCPU has its own thread. The TLS
 * variable current_cpu can be used deep in the code to find the
 * current CPUState for a given thread.
 */

/*
 * Thread entry point for one MTTCG vCPU; @arg is the CPUState to run.
 * icount must be disabled (asserted).  The thread alternates between
 * tcg_cpu_exec() and qemu_wait_io_event() until the CPU has been
 * flagged for unplug and can no longer run.  Always returns NULL.
 */
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    g_assert(!use_icount);

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* process any pending work */
    cpu->exit_request = 1;

    do {
        if (cpu_can_run(cpu)) {
            int r;
            /* Guest code runs without the iothread lock held. */
            qemu_mutex_unlock_iothread();
            r = tcg_cpu_exec(cpu);
            qemu_mutex_lock_iothread();
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start-up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
                /* fall through */
            default:
                /* Ignore everything else? */
                break;
            }
        }

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Tear down and announce removal before dropping the iothread lock. */
    qemu_tcg_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1780
1781 static void qemu_cpu_kick_thread(CPUState *cpu)
1782 {
1783 #ifndef _WIN32
1784     int err;
1785
1786     if (cpu->thread_kicked) {
1787         return;
1788     }
1789     cpu->thread_kicked = true;
1790     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1791     if (err && err != ESRCH) {
1792         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1793         exit(1);
1794     }
1795 #else /* _WIN32 */
1796     if (!qemu_cpu_is_self(cpu)) {
1797         if (whpx_enabled()) {
1798             whpx_vcpu_kick(cpu);
1799         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1800             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1801                     __func__, GetLastError());
1802             exit(1);
1803         }
1804     }
1805 #endif
1806 }
1807
1808 void qemu_cpu_kick(CPUState *cpu)
1809 {
1810     qemu_cond_broadcast(cpu->halt_cond);
1811     if (tcg_enabled()) {
1812         cpu_exit(cpu);
1813         /* NOP unless doing single-thread RR */
1814         qemu_cpu_kick_rr_cpu();
1815     } else {
1816         if (hax_enabled()) {
1817             /*
1818              * FIXME: race condition with the exit_request check in
1819              * hax_vcpu_hax_exec
1820              */
1821             cpu->exit_request = 1;
1822         }
1823         qemu_cpu_kick_thread(cpu);
1824     }
1825 }
1826
/*
 * Kick the host thread of the calling vCPU.  current_cpu must be set
 * (asserted), so this may only be used from a thread that has one.
 */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1832
/* Return true if the calling thread is the thread recorded in cpu->thread. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1837
1838 bool qemu_in_vcpu_thread(void)
1839 {
1840     return current_cpu && qemu_cpu_is_self(current_cpu);
1841 }
1842
/*
 * Per-thread flag: set by qemu_mutex_lock_iothread_impl() and cleared by
 * qemu_mutex_unlock_iothread().
 */
static __thread bool iothread_locked = false;
1844
/*
 * Return the calling thread's iothread_locked flag, i.e. whether this
 * thread took the iothread lock through qemu_mutex_lock_iothread_impl()
 * and has not yet released it with qemu_mutex_unlock_iothread().
 */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1849
1850 /*
1851  * The BQL is taken from so many places that it is worth profiling the
1852  * callers directly, instead of funneling them all through a single function.
1853  */
1854 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1855 {
1856     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1857
1858     g_assert(!qemu_mutex_iothread_locked());
1859     bql_lock(&qemu_global_mutex, file, line);
1860     iothread_locked = true;
1861 }
1862
/*
 * Release the iothread lock.  The calling thread must currently hold it
 * (asserted); the per-thread flag is cleared before the mutex itself is
 * unlocked.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1869
1870 static bool all_vcpus_paused(void)
1871 {
1872     CPUState *cpu;
1873
1874     CPU_FOREACH(cpu) {
1875         if (!cpu->stopped) {
1876             return false;
1877         }
1878     }
1879
1880     return true;
1881 }
1882
/*
 * Stop every vCPU and wait until all of them report ->stopped.
 *
 * The wait uses qemu_global_mutex and the function ends by unlocking
 * and re-locking the iothread lock, so the caller must hold that lock.
 * The replay mutex is released while waiting and re-taken before
 * returning (presumably the caller holds it on entry -- confirm against
 * callers).
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            /* Our own vCPU can be stopped directly. */
            qemu_cpu_stop(cpu, true);
        } else {
            /* Other vCPUs are asked to stop and kicked to notice it. */
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        /* Kick everyone again after each wakeup. */
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /* Re-take the replay mutex without holding the iothread lock. */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1913
/*
 * Let @cpu run again: clear both its pending stop request and its
 * stopped state, then kick it so that it notices.
 */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1920
/* Re-enable QEMU_CLOCK_VIRTUAL and resume every CPU with cpu_resume(). */
void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}
1930
/*
 * Ask @cpu's thread to stop and unplug, then wait for that thread to
 * terminate.  The iothread lock is released around the join and
 * re-taken afterwards, so the caller must hold it on entry.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1940
/*
 * Size in bytes of the temporary stack buffers in which vCPU thread
 * names such as "CPU %d/TCG" are formatted with snprintf(); longer
 * names are truncated to fit.
 */
#define VCPU_THREAD_NAME_SIZE 16
1943
/*
 * Set up the host thread for a TCG vCPU.
 *
 * With MTTCG every vCPU gets its own thread and halt condition.
 * Without it, the first call creates the single round-robin thread and
 * remembers its thread and halt condition in function-local statics;
 * later calls simply attach @cpu to that shared thread.
 */
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *single_tcg_halt_cond;
    static QemuThread *single_tcg_cpu_thread;
    static int tcg_region_inited;

    assert(tcg_enabled());
    /*
     * Initialize TCG regions--once. Now is a good time, because:
     * (1) TCG's init context, prologue and target globals have been set up.
     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
     *     -accel flag is processed, so the check doesn't work then).
     */
    if (!tcg_region_inited) {
        tcg_region_inited = 1;
        tcg_region_init();
    }

    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);

        if (qemu_tcg_mttcg_enabled()) {
            /* create a thread per vCPU with TCG (MTTCG) */
            parallel_cpus = true;
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);

            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

        } else {
            /* share a single thread for all cpus with TCG */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
            qemu_thread_create(cpu->thread, thread_name,
                               qemu_tcg_rr_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

            /* Remember them so that later vCPUs can share this thread. */
            single_tcg_halt_cond = cpu->halt_cond;
            single_tcg_cpu_thread = cpu->thread;
        }
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    } else {
        /* For non-MTTCG cases we share the thread */
        cpu->thread = single_tcg_cpu_thread;
        cpu->halt_cond = single_tcg_halt_cond;
        cpu->thread_id = first_cpu->thread_id;
        cpu->can_do_io = 1;
        /* No new thread is started here, so mark the vCPU created now. */
        cpu->created = true;
    }
}
1999
2000 static void qemu_hax_start_vcpu(CPUState *cpu)
2001 {
2002     char thread_name[VCPU_THREAD_NAME_SIZE];
2003
2004     cpu->thread = g_malloc0(sizeof(QemuThread));
2005     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2006     qemu_cond_init(cpu->halt_cond);
2007
2008     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2009              cpu->cpu_index);
2010     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2011                        cpu, QEMU_THREAD_JOINABLE);
2012 #ifdef _WIN32
2013     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2014 #endif
2015 }
2016
2017 static void qemu_kvm_start_vcpu(CPUState *cpu)
2018 {
2019     char thread_name[VCPU_THREAD_NAME_SIZE];
2020
2021     cpu->thread = g_malloc0(sizeof(QemuThread));
2022     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2023     qemu_cond_init(cpu->halt_cond);
2024     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2025              cpu->cpu_index);
2026     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2027                        cpu, QEMU_THREAD_JOINABLE);
2028 }
2029
2030 static void qemu_hvf_start_vcpu(CPUState *cpu)
2031 {
2032     char thread_name[VCPU_THREAD_NAME_SIZE];
2033
2034     /* HVF currently does not support TCG, and only runs in
2035      * unrestricted-guest mode. */
2036     assert(hvf_enabled());
2037
2038     cpu->thread = g_malloc0(sizeof(QemuThread));
2039     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2040     qemu_cond_init(cpu->halt_cond);
2041
2042     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2043              cpu->cpu_index);
2044     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2045                        cpu, QEMU_THREAD_JOINABLE);
2046 }
2047
2048 static void qemu_whpx_start_vcpu(CPUState *cpu)
2049 {
2050     char thread_name[VCPU_THREAD_NAME_SIZE];
2051
2052     cpu->thread = g_malloc0(sizeof(QemuThread));
2053     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2054     qemu_cond_init(cpu->halt_cond);
2055     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2056              cpu->cpu_index);
2057     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2058                        cpu, QEMU_THREAD_JOINABLE);
2059 #ifdef _WIN32
2060     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2061 #endif
2062 }
2063
2064 static void qemu_dummy_start_vcpu(CPUState *cpu)
2065 {
2066     char thread_name[VCPU_THREAD_NAME_SIZE];
2067
2068     cpu->thread = g_malloc0(sizeof(QemuThread));
2069     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2070     qemu_cond_init(cpu->halt_cond);
2071     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2072              cpu->cpu_index);
2073     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2074                        QEMU_THREAD_JOINABLE);
2075 }
2076
2077 void qemu_init_vcpu(CPUState *cpu)
2078 {
2079     cpu->nr_cores = smp_cores;
2080     cpu->nr_threads = smp_threads;
2081     cpu->stopped = true;
2082     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2083
2084     if (!cpu->as) {
2085         /* If the target cpu hasn't set up any address spaces itself,
2086          * give it the default one.
2087          */
2088         cpu->num_ases = 1;
2089         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2090     }
2091
2092     if (kvm_enabled()) {
2093         qemu_kvm_start_vcpu(cpu);
2094     } else if (hax_enabled()) {
2095         qemu_hax_start_vcpu(cpu);
2096     } else if (hvf_enabled()) {
2097         qemu_hvf_start_vcpu(cpu);
2098     } else if (tcg_enabled()) {
2099         qemu_tcg_init_vcpu(cpu);
2100     } else if (whpx_enabled()) {
2101         qemu_whpx_start_vcpu(cpu);
2102     } else {
2103         qemu_dummy_start_vcpu(cpu);
2104     }
2105
2106     while (!cpu->created) {
2107         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2108     }
2109 }
2110
2111 void cpu_stop_current(void)
2112 {
2113     if (current_cpu) {
2114         current_cpu->stop = true;
2115         cpu_exit(current_cpu);
2116     }
2117 }
2118
2119 int vm_stop(RunState state)
2120 {
2121     if (qemu_in_vcpu_thread()) {
2122         qemu_system_vmstop_request_prepare();
2123         qemu_system_vmstop_request(state);
2124         /*
2125          * FIXME: should not return to device code in case
2126          * vm_stop() has been requested.
2127          */
2128         cpu_stop_current();
2129         return 0;
2130     }
2131
2132     return do_vm_stop(state, true);
2133 }
2134
2135 /**
2136  * Prepare for (re)starting the VM.
2137  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2138  * running or in case of an error condition), 0 otherwise.
2139  */
2140 int vm_prepare_start(void)
2141 {
2142     RunState requested;
2143
2144     qemu_vmstop_requested(&requested);
2145     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2146         return -1;
2147     }
2148
2149     /* Ensure that a STOP/RESUME pair of events is emitted if a
2150      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2151      * example, according to documentation is always followed by
2152      * the STOP event.
2153      */
2154     if (runstate_is_running()) {
2155         qapi_event_send_stop();
2156         qapi_event_send_resume();
2157         return -1;
2158     }
2159
2160     /* We are sending this now, but the CPUs will be resumed shortly later */
2161     qapi_event_send_resume();
2162
2163     replay_enable_events();
2164     cpu_enable_ticks();
2165     runstate_set(RUN_STATE_RUNNING);
2166     vm_state_notify(1, RUN_STATE_RUNNING);
2167     return 0;
2168 }
2169
/*
 * Start the VM: resume all vCPUs, but only when vm_prepare_start()
 * returns 0.
 */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
2176
2177 /* does a state transition even if the VM is already stopped,
2178    current state is forgotten forever */
2179 int vm_stop_force_state(RunState state)
2180 {
2181     if (runstate_is_running()) {
2182         return vm_stop(state);
2183     } else {
2184         runstate_set(state);
2185
2186         bdrv_drain_all();
2187         /* Make sure to return an error if the flush in a previous vm_stop()
2188          * failed. */
2189         return bdrv_flush_all();
2190     }
2191 }
2192
/*
 * Invoke the target's cpu_list() when the target defines that macro;
 * otherwise do nothing.  @optarg is not used.
 */
void list_cpus(const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#ifdef cpu_list
    cpu_list();
#endif
}
2200
2201 CpuInfoList *qmp_query_cpus(Error **errp)
2202 {
2203     MachineState *ms = MACHINE(qdev_get_machine());
2204     MachineClass *mc = MACHINE_GET_CLASS(ms);
2205     CpuInfoList *head = NULL, *cur_item = NULL;
2206     CPUState *cpu;
2207
2208     CPU_FOREACH(cpu) {
2209         CpuInfoList *info;
2210 #if defined(TARGET_I386)
2211         X86CPU *x86_cpu = X86_CPU(cpu);
2212         CPUX86State *env = &x86_cpu->env;
2213 #elif defined(TARGET_PPC)
2214         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2215         CPUPPCState *env = &ppc_cpu->env;
2216 #elif defined(TARGET_SPARC)
2217         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2218         CPUSPARCState *env = &sparc_cpu->env;
2219 #elif defined(TARGET_RISCV)
2220         RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2221         CPURISCVState *env = &riscv_cpu->env;
2222 #elif defined(TARGET_MIPS)
2223         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2224         CPUMIPSState *env = &mips_cpu->env;
2225 #elif defined(TARGET_TRICORE)
2226         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2227         CPUTriCoreState *env = &tricore_cpu->env;
2228 #elif defined(TARGET_S390X)
2229         S390CPU *s390_cpu = S390_CPU(cpu);
2230         CPUS390XState *env = &s390_cpu->env;
2231 #endif
2232
2233         cpu_synchronize_state(cpu);
2234
2235         info = g_malloc0(sizeof(*info));
2236         info->value = g_malloc0(sizeof(*info->value));
2237         info->value->CPU = cpu->cpu_index;
2238         info->value->current = (cpu == first_cpu);
2239         info->value->halted = cpu->halted;
2240         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2241         info->value->thread_id = cpu->thread_id;
2242 #if defined(TARGET_I386)
2243         info->value->arch = CPU_INFO_ARCH_X86;
2244         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2245 #elif defined(TARGET_PPC)
2246         info->value->arch = CPU_INFO_ARCH_PPC;
2247         info->value->u.ppc.nip = env->nip;
2248 #elif defined(TARGET_SPARC)
2249         info->value->arch = CPU_INFO_ARCH_SPARC;
2250         info->value->u.q_sparc.pc = env->pc;
2251         info->value->u.q_sparc.npc = env->npc;
2252 #elif defined(TARGET_MIPS)
2253         info->value->arch = CPU_INFO_ARCH_MIPS;
2254         info->value->u.q_mips.PC = env->active_tc.PC;
2255 #elif defined(TARGET_TRICORE)
2256         info->value->arch = CPU_INFO_ARCH_TRICORE;
2257         info->value->u.tricore.PC = env->PC;
2258 #elif defined(TARGET_S390X)
2259         info->value->arch = CPU_INFO_ARCH_S390;
2260         info->value->u.s390.cpu_state = env->cpu_state;
2261 #elif defined(TARGET_RISCV)
2262         info->value->arch = CPU_INFO_ARCH_RISCV;
2263         info->value->u.riscv.pc = env->pc;
2264 #else
2265         info->value->arch = CPU_INFO_ARCH_OTHER;
2266 #endif
2267         info->value->has_props = !!mc->cpu_index_to_instance_props;
2268         if (info->value->has_props) {
2269             CpuInstanceProperties *props;
2270             props = g_malloc0(sizeof(*props));
2271             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2272             info->value->props = props;
2273         }
2274
2275         /* XXX: waiting for the qapi to support GSList */
2276         if (!cur_item) {
2277             head = cur_item = info;
2278         } else {
2279             cur_item->next = info;
2280             cur_item = info;
2281         }
2282     }
2283
2284     return head;
2285 }
2286
2287 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2288 {
2289     /*
2290      * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2291      * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2292      */
2293     switch (target) {
2294     case SYS_EMU_TARGET_I386:
2295     case SYS_EMU_TARGET_X86_64:
2296         return CPU_INFO_ARCH_X86;
2297
2298     case SYS_EMU_TARGET_PPC:
2299     case SYS_EMU_TARGET_PPC64:
2300         return CPU_INFO_ARCH_PPC;
2301
2302     case SYS_EMU_TARGET_SPARC:
2303     case SYS_EMU_TARGET_SPARC64:
2304         return CPU_INFO_ARCH_SPARC;
2305
2306     case SYS_EMU_TARGET_MIPS:
2307     case SYS_EMU_TARGET_MIPSEL:
2308     case SYS_EMU_TARGET_MIPS64:
2309     case SYS_EMU_TARGET_MIPS64EL:
2310         return CPU_INFO_ARCH_MIPS;
2311
2312     case SYS_EMU_TARGET_TRICORE:
2313         return CPU_INFO_ARCH_TRICORE;
2314
2315     case SYS_EMU_TARGET_S390X:
2316         return CPU_INFO_ARCH_S390;
2317
2318     case SYS_EMU_TARGET_RISCV32:
2319     case SYS_EMU_TARGET_RISCV64:
2320         return CPU_INFO_ARCH_RISCV;
2321
2322     default:
2323         return CPU_INFO_ARCH_OTHER;
2324     }
2325 }
2326
2327 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2328 {
2329 #ifdef TARGET_S390X
2330     S390CPU *s390_cpu = S390_CPU(cpu);
2331     CPUS390XState *env = &s390_cpu->env;
2332
2333     info->cpu_state = env->cpu_state;
2334 #else
2335     abort();
2336 #endif
2337 }
2338
2339 /*
2340  * fast means: we NEVER interrupt vCPU threads to retrieve
2341  * information from KVM.
2342  */
2343 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2344 {
2345     MachineState *ms = MACHINE(qdev_get_machine());
2346     MachineClass *mc = MACHINE_GET_CLASS(ms);
2347     CpuInfoFastList *head = NULL, *cur_item = NULL;
2348     SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2349                                           -1, &error_abort);
2350     CPUState *cpu;
2351
2352     CPU_FOREACH(cpu) {
2353         CpuInfoFastList *info = g_malloc0(sizeof(*info));
2354         info->value = g_malloc0(sizeof(*info->value));
2355
2356         info->value->cpu_index = cpu->cpu_index;
2357         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2358         info->value->thread_id = cpu->thread_id;
2359
2360         info->value->has_props = !!mc->cpu_index_to_instance_props;
2361         if (info->value->has_props) {
2362             CpuInstanceProperties *props;
2363             props = g_malloc0(sizeof(*props));
2364             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2365             info->value->props = props;
2366         }
2367
2368         info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2369         info->value->target = target;
2370         if (target == SYS_EMU_TARGET_S390X) {
2371             cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2372         }
2373
2374         if (!cur_item) {
2375             head = cur_item = info;
2376         } else {
2377             cur_item->next = info;
2378             cur_item = info;
2379         }
2380     }
2381
2382     return head;
2383 }
2384
2385 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2386                  bool has_cpu, int64_t cpu_index, Error **errp)
2387 {
2388     FILE *f;
2389     uint32_t l;
2390     CPUState *cpu;
2391     uint8_t buf[1024];
2392     int64_t orig_addr = addr, orig_size = size;
2393
2394     if (!has_cpu) {
2395         cpu_index = 0;
2396     }
2397
2398     cpu = qemu_get_cpu(cpu_index);
2399     if (cpu == NULL) {
2400         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2401                    "a CPU number");
2402         return;
2403     }
2404
2405     f = fopen(filename, "wb");
2406     if (!f) {
2407         error_setg_file_open(errp, errno, filename);
2408         return;
2409     }
2410
2411     while (size != 0) {
2412         l = sizeof(buf);
2413         if (l > size)
2414             l = size;
2415         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2416             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2417                              " specified", orig_addr, orig_size);
2418             goto exit;
2419         }
2420         if (fwrite(buf, 1, l, f) != l) {
2421             error_setg(errp, QERR_IO_ERROR);
2422             goto exit;
2423         }
2424         addr += l;
2425         size -= l;
2426     }
2427
2428 exit:
2429     fclose(f);
2430 }
2431
2432 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2433                   Error **errp)
2434 {
2435     FILE *f;
2436     uint32_t l;
2437     uint8_t buf[1024];
2438
2439     f = fopen(filename, "wb");
2440     if (!f) {
2441         error_setg_file_open(errp, errno, filename);
2442         return;
2443     }
2444
2445     while (size != 0) {
2446         l = sizeof(buf);
2447         if (l > size)
2448             l = size;
2449         cpu_physical_memory_read(addr, buf, l);
2450         if (fwrite(buf, 1, l, f) != l) {
2451             error_setg(errp, QERR_IO_ERROR);
2452             goto exit;
2453         }
2454         addr += l;
2455         size -= l;
2456     }
2457
2458 exit:
2459     fclose(f);
2460 }
2461
2462 void qmp_inject_nmi(Error **errp)
2463 {
2464     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2465 }
2466
2467 void dump_drift_info(void)
2468 {
2469     if (!use_icount) {
2470         return;
2471     }
2472
2473     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2474                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2475     if (icount_align_option) {
2476         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2477                     -max_delay / SCALE_MS);
2478         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2479                     max_advance / SCALE_MS);
2480     } else {
2481         qemu_printf("Max guest delay     NA\n");
2482         qemu_printf("Max guest advance   NA\n");
2483     }
2484 }