cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu-common.h"
  27 #include "qemu/config-file.h"
  28 #include "migration/vmstate.h"
  29 #include "monitor/monitor.h"
  30 #include "qapi/error.h"
  31 #include "qapi/qapi-commands-misc.h"
  32 #include "qapi/qapi-events-run-state.h"
  33 #include "qapi/qmp/qerror.h"
  34 #include "qemu/error-report.h"
  35 #include "qemu/qemu-print.h"
  36 #include "sysemu/tcg.h"
  37 #include "sysemu/block-backend.h"
  38 #include "exec/gdbstub.h"
  39 #include "sysemu/dma.h"
  40 #include "sysemu/hw_accel.h"
  41 #include "sysemu/kvm.h"
  42 #include "sysemu/hax.h"
  43 #include "sysemu/hvf.h"
  44 #include "sysemu/whpx.h"
  45 #include "exec/exec-all.h"
  46
  47 #include "qemu/thread.h"
  48 #include "qemu/plugin.h"
  49 #include "sysemu/cpus.h"
  50 #include "sysemu/qtest.h"
  51 #include "qemu/main-loop.h"
  52 #include "qemu/option.h"
  53 #include "qemu/bitmap.h"
  54 #include "qemu/seqlock.h"
  55 #include "qemu/guest-random.h"
  56 #include "tcg/tcg.h"
  57 #include "hw/nmi.h"
  58 #include "sysemu/replay.h"
  59 #include "sysemu/runstate.h"
  60 #include "hw/boards.h"
  61 #include "hw/hw.h"
  62
#ifdef CONFIG_LINUX

#include <sys/prctl.h>

/*
 * Fallback definitions of the PR_MCE_KILL* prctl constants, used only
 * when <sys/prctl.h> on the build host does not already define them.
 */
#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */
  80
/*
 * Global mutex.  In this part of the file it is the mutex handed to
 * qemu_cond_timedwait() while a throttled vCPU waits on its halt_cond.
 * NOTE(review): presumably this is the lock taken by
 * qemu_mutex_lock_iothread() -- confirm against the rest of the file.
 */
static QemuMutex qemu_global_mutex;

/* Not referenced in this part of the file; exported for other users. */
int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
/* Periodic timer driving cpu_throttle_timer_tick(); see cpu_ticks_init(). */
static QEMUTimer *throttle_timer;
/* Current throttle percentage; 0 means throttling is inactive. */
static unsigned int throttle_percentage;

/* Bounds that cpu_throttle_set() clamps the requested percentage to. */
#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/* Base timeslice, in nanoseconds (10 ms), for the throttle computations. */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
  93
  94 bool cpu_is_stopped(CPUState *cpu)
  95 {
  96     return cpu->stopped || !runstate_is_running();
  97 }
  98
  99 static bool cpu_thread_is_idle(CPUState *cpu)
 100 {
 101     if (cpu->stop || cpu->queued_work_first) {
 102         return false;
 103     }
 104     if (cpu_is_stopped(cpu)) {
 105         return true;
 106     }
 107     if (!cpu->halted || cpu_has_work(cpu) ||
 108         kvm_halt_in_kernel()) {
 109         return false;
 110     }
 111     return true;
 112 }
 113
 114 static bool all_cpu_threads_idle(void)
 115 {
 116     CPUState *cpu;
 117
 118     CPU_FOREACH(cpu) {
 119         if (!cpu_thread_is_idle(cpu)) {
 120             return false;
 121         }
 122     }
 123     return true;
 124 }
 125
 126 /***********************************************************/
 127 /* guest cycle counter */
 128
 129 /* Protected by TimersState seqlock */
 130
/*
 * Value of the icount "sleep" option (default true), set by
 * configure_icount().  When false, qemu_start_warp_timer() advances the
 * icount bias straight to the next deadline instead of arming the warp
 * timer.
 */
static bool icount_sleep = true;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
/* Upper bound for TimersState.icount_time_shift used by icount_adjust(). */
#define MAX_ICOUNT_SHIFT 10
 134
/*
 * State of the guest tick counter, the VM clock and the instruction
 * counter (icount).  A single instance, timers_state, is used by this
 * file.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    /* Last value returned by cpu_get_ticks_locked(); keeps it monotonic. */
    int64_t cpu_ticks_prev;
    /* Offset added to the host tick counter by cpu_get_ticks_locked(). */
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    /* Non-zero between cpu_enable_ticks() and cpu_disable_ticks(). */
    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks.  */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;

    /*
     * Clock value at which the pending warp started; -1 means no warp
     * is pending (see icount_warp_rt()).
     */
    int64_t vm_clock_warp_start;
    /* Offset added to get_clock() by cpu_get_clock_locked(). */
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    /* rt/vm timers are created by configure_icount() for shift=auto. */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    /* Created by configure_icount() only when the sleep option is on. */
    QEMUTimer *icount_warp_timer;
} TimersState;
 165
/* The single TimersState instance; registered for migration in cpu_ticks_init(). */
static TimersState timers_state;
/* The round-robin kick timer helpers in this file assert that this is false. */
bool mttcg_enabled;
 168
 169
 170 /* The current number of executed instructions is based on what we
 171  * originally budgeted minus the current state of the decrementing
 172  * icount counters in extra/u16.low.
 173  */
 174 static int64_t cpu_get_icount_executed(CPUState *cpu)
 175 {
 176     return (cpu->icount_budget -
 177             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
 178 }
 179
 180 /*
 181  * Update the global shared timer_state.qemu_icount to take into
 182  * account executed instructions. This is done by the TCG vCPU
 183  * thread so the main-loop can see time has moved forward.
 184  */
 185 static void cpu_update_icount_locked(CPUState *cpu)
 186 {
 187     int64_t executed = cpu_get_icount_executed(cpu);
 188     cpu->icount_budget -= executed;
 189
 190     atomic_set_i64(&timers_state.qemu_icount,
 191                    timers_state.qemu_icount + executed);
 192 }
 193
 194 /*
 195  * Update the global shared timer_state.qemu_icount to take into
 196  * account executed instructions. This is done by the TCG vCPU
 197  * thread so the main-loop can see time has moved forward.
 198  */
 199 void cpu_update_icount(CPUState *cpu)
 200 {
 201     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 202                        &timers_state.vm_clock_lock);
 203     cpu_update_icount_locked(cpu);
 204     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 205                          &timers_state.vm_clock_lock);
 206 }
 207
 208 static int64_t cpu_get_icount_raw_locked(void)
 209 {
 210     CPUState *cpu = current_cpu;
 211
 212     if (cpu && cpu->running) {
 213         if (!cpu->can_do_io) {
 214             error_report("Bad icount read");
 215             exit(1);
 216         }
 217         /* Take into account what has run */
 218         cpu_update_icount_locked(cpu);
 219     }
 220     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 221     return atomic_read_i64(&timers_state.qemu_icount);
 222 }
 223
 224 static int64_t cpu_get_icount_locked(void)
 225 {
 226     int64_t icount = cpu_get_icount_raw_locked();
 227     return atomic_read_i64(&timers_state.qemu_icount_bias) +
 228         cpu_icount_to_ns(icount);
 229 }
 230
 231 int64_t cpu_get_icount_raw(void)
 232 {
 233     int64_t icount;
 234     unsigned start;
 235
 236     do {
 237         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 238         icount = cpu_get_icount_raw_locked();
 239     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 240
 241     return icount;
 242 }
 243
 244 /* Return the virtual CPU time, based on the instruction counter.  */
 245 int64_t cpu_get_icount(void)
 246 {
 247     int64_t icount;
 248     unsigned start;
 249
 250     do {
 251         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 252         icount = cpu_get_icount_locked();
 253     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 254
 255     return icount;
 256 }
 257
 258 int64_t cpu_icount_to_ns(int64_t icount)
 259 {
 260     return icount << atomic_read(&timers_state.icount_time_shift);
 261 }
 262
 263 static int64_t cpu_get_ticks_locked(void)
 264 {
 265     int64_t ticks = timers_state.cpu_ticks_offset;
 266     if (timers_state.cpu_ticks_enabled) {
 267         ticks += cpu_get_host_ticks();
 268     }
 269
 270     if (timers_state.cpu_ticks_prev > ticks) {
 271         /* Non increasing ticks may happen if the host uses software suspend.  */
 272         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 273         ticks = timers_state.cpu_ticks_prev;
 274     }
 275
 276     timers_state.cpu_ticks_prev = ticks;
 277     return ticks;
 278 }
 279
 280 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 281  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 282  * counter.
 283  */
 284 int64_t cpu_get_ticks(void)
 285 {
 286     int64_t ticks;
 287
 288     if (use_icount) {
 289         return cpu_get_icount();
 290     }
 291
 292     qemu_spin_lock(&timers_state.vm_clock_lock);
 293     ticks = cpu_get_ticks_locked();
 294     qemu_spin_unlock(&timers_state.vm_clock_lock);
 295     return ticks;
 296 }
 297
 298 static int64_t cpu_get_clock_locked(void)
 299 {
 300     int64_t time;
 301
 302     time = timers_state.cpu_clock_offset;
 303     if (timers_state.cpu_ticks_enabled) {
 304         time += get_clock();
 305     }
 306
 307     return time;
 308 }
 309
 310 /* Return the monotonic time elapsed in VM, i.e.,
 311  * the time between vm_start and vm_stop
 312  */
 313 int64_t cpu_get_clock(void)
 314 {
 315     int64_t ti;
 316     unsigned start;
 317
 318     do {
 319         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 320         ti = cpu_get_clock_locked();
 321     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 322
 323     return ti;
 324 }
 325
 326 /* enable cpu_get_ticks()
 327  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 328  */
 329 void cpu_enable_ticks(void)
 330 {
 331     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 332                        &timers_state.vm_clock_lock);
 333     if (!timers_state.cpu_ticks_enabled) {
 334         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 335         timers_state.cpu_clock_offset -= get_clock();
 336         timers_state.cpu_ticks_enabled = 1;
 337     }
 338     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 339                        &timers_state.vm_clock_lock);
 340 }
 341
 342 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 343  * cpu_get_ticks() after that.
 344  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 345  */
 346 void cpu_disable_ticks(void)
 347 {
 348     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 349                        &timers_state.vm_clock_lock);
 350     if (timers_state.cpu_ticks_enabled) {
 351         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 352         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 353         timers_state.cpu_ticks_enabled = 0;
 354     }
 355     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 356                          &timers_state.vm_clock_lock);
 357 }
 358
/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.
   The tolerance is one tenth of a second, in nanoseconds; it is used
   by icount_adjust() when deciding whether to change the shift.  */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 364
 365 static void icount_adjust(void)
 366 {
 367     int64_t cur_time;
 368     int64_t cur_icount;
 369     int64_t delta;
 370
 371     /* Protected by TimersState mutex.  */
 372     static int64_t last_delta;
 373
 374     /* If the VM is not running, then do nothing.  */
 375     if (!runstate_is_running()) {
 376         return;
 377     }
 378
 379     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 380                        &timers_state.vm_clock_lock);
 381     cur_time = cpu_get_clock_locked();
 382     cur_icount = cpu_get_icount_locked();
 383
 384     delta = cur_icount - cur_time;
 385     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 386     if (delta > 0
 387         && last_delta + ICOUNT_WOBBLE < delta * 2
 388         && timers_state.icount_time_shift > 0) {
 389         /* The guest is getting too far ahead.  Slow time down.  */
 390         atomic_set(&timers_state.icount_time_shift,
 391                    timers_state.icount_time_shift - 1);
 392     }
 393     if (delta < 0
 394         && last_delta - ICOUNT_WOBBLE > delta * 2
 395         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 396         /* The guest is getting too far behind.  Speed time up.  */
 397         atomic_set(&timers_state.icount_time_shift,
 398                    timers_state.icount_time_shift + 1);
 399     }
 400     last_delta = delta;
 401     atomic_set_i64(&timers_state.qemu_icount_bias,
 402                    cur_icount - (timers_state.qemu_icount
 403                                  << timers_state.icount_time_shift));
 404     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 405                          &timers_state.vm_clock_lock);
 406 }
 407
 408 static void icount_adjust_rt(void *opaque)
 409 {
 410     timer_mod(timers_state.icount_rt_timer,
 411               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 412     icount_adjust();
 413 }
 414
 415 static void icount_adjust_vm(void *opaque)
 416 {
 417     timer_mod(timers_state.icount_vm_timer,
 418                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 419                    NANOSECONDS_PER_SECOND / 10);
 420     icount_adjust();
 421 }
 422
 423 static int64_t qemu_icount_round(int64_t count)
 424 {
 425     int shift = atomic_read(&timers_state.icount_time_shift);
 426     return (count + (1 << shift) - 1) >> shift;
 427 }
 428
 429 static void icount_warp_rt(void)
 430 {
 431     unsigned seq;
 432     int64_t warp_start;
 433
 434     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 435      * changes from -1 to another value, so the race here is okay.
 436      */
 437     do {
 438         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 439         warp_start = timers_state.vm_clock_warp_start;
 440     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 441
 442     if (warp_start == -1) {
 443         return;
 444     }
 445
 446     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 447                        &timers_state.vm_clock_lock);
 448     if (runstate_is_running()) {
 449         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 450                                             cpu_get_clock_locked());
 451         int64_t warp_delta;
 452
 453         warp_delta = clock - timers_state.vm_clock_warp_start;
 454         if (use_icount == 2) {
 455             /*
 456              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 457              * far ahead of real time.
 458              */
 459             int64_t cur_icount = cpu_get_icount_locked();
 460             int64_t delta = clock - cur_icount;
 461             warp_delta = MIN(warp_delta, delta);
 462         }
 463         atomic_set_i64(&timers_state.qemu_icount_bias,
 464                        timers_state.qemu_icount_bias + warp_delta);
 465     }
 466     timers_state.vm_clock_warp_start = -1;
 467     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 468                        &timers_state.vm_clock_lock);
 469
 470     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 471         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 472     }
 473 }
 474
/*
 * Callback of icount_warp_timer (installed by configure_icount()):
 * accounts the pending warp.  @opaque is unused.
 */
static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}
 482
 483 void qtest_clock_warp(int64_t dest)
 484 {
 485     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 486     AioContext *aio_context;
 487     assert(qtest_enabled());
 488     aio_context = qemu_get_aio_context();
 489     while (clock < dest) {
 490         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 491                                                       QEMU_TIMER_ATTR_ALL);
 492         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 493
 494         seqlock_write_lock(&timers_state.vm_clock_seqlock,
 495                            &timers_state.vm_clock_lock);
 496         atomic_set_i64(&timers_state.qemu_icount_bias,
 497                        timers_state.qemu_icount_bias + warp);
 498         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 499                              &timers_state.vm_clock_lock);
 500
 501         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 502         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 503         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 504     }
 505     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 506 }
 507
/*
 * Start a clock warp towards the next QEMU_CLOCK_VIRTUAL deadline.
 *
 * Does nothing unless icount is in use and the VM is running.  Outside
 * of replay-play mode it also requires every vCPU thread to be idle and
 * qtest to be disabled.  With the sleep option off, the icount bias is
 * advanced to the deadline immediately; otherwise the warp start time
 * is recorded and icount_warp_timer is armed for the deadline.
 */
void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    if (replay_mode != REPLAY_MODE_PLAY) {
        if (!all_cpu_threads_idle()) {
            return;
        }

        if (qtest_enabled()) {
            /* When testing, qtest commands advance icount.  */
            return;
        }

        replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
    } else {
        /* warp clock deterministically in record/replay mode */
        if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
            /* vCPU is sleeping and warp can't be started.
               It is probably a race condition: notification sent
               to vCPU was processed in advance and vCPU went to sleep.
               Therefore we have to wake it up for doing something. */
            if (replay_has_checkpoint()) {
                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
            }
            return;
        }
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
                                          ~QEMU_TIMER_ATTR_EXTERNAL);
    if (deadline < 0) {
        /* Warn only once per process about the missing timers. */
        static bool notified;
        if (!icount_sleep && !notified) {
            warn_report("icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * It is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_lock(&timers_state.vm_clock_seqlock,
                               &timers_state.vm_clock_lock);
            atomic_set_i64(&timers_state.qemu_icount_bias,
                           timers_state.qemu_icount_bias + deadline);
            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                                 &timers_state.vm_clock_lock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time, (related to the time left until the next event) has
             * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This avoids that the warps are visible externally; for example,
             * you will not be sending network packets continuously instead of
             * every 100ms.
             */
            seqlock_write_lock(&timers_state.vm_clock_seqlock,
                               &timers_state.vm_clock_lock);
            /* Keep the earliest start time if a warp is already pending. */
            if (timers_state.vm_clock_warp_start == -1
                || timers_state.vm_clock_warp_start > clock) {
                timers_state.vm_clock_warp_start = clock;
            }
            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                                 &timers_state.vm_clock_lock);
            timer_mod_anticipate(timers_state.icount_warp_timer,
                                 clock + deadline);
        }
    } else if (deadline == 0) {
        /* A timer is already due: just notify the clock. */
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}
 609
 610 static void qemu_account_warp_timer(void)
 611 {
 612     if (!use_icount || !icount_sleep) {
 613         return;
 614     }
 615
 616     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 617      * do not fire, so computing the deadline does not make sense.
 618      */
 619     if (!runstate_is_running()) {
 620         return;
 621     }
 622
 623     /* warp clock deterministically in record/replay mode */
 624     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 625         return;
 626     }
 627
 628     timer_del(timers_state.icount_warp_timer);
 629     icount_warp_rt();
 630 }
 631
 632 static bool icount_state_needed(void *opaque)
 633 {
 634     return use_icount;
 635 }
 636
 637 static bool warp_timer_state_needed(void *opaque)
 638 {
 639     TimersState *s = opaque;
 640     return s->icount_warp_timer != NULL;
 641 }
 642
 643 static bool adjust_timers_state_needed(void *opaque)
 644 {
 645     TimersState *s = opaque;
 646     return s->icount_rt_timer != NULL;
 647 }
 648
/*
 * Subsection for warp timer migration.  It is optional because the warp
 * timer may not have been created (see warp_timer_state_needed()).
 * Carries the warp start time and the warp timer itself.
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 663
/*
 * Optional subsection carrying the two icount adjustment timers; sent
 * only when icount_rt_timer exists (see adjust_timers_state_needed()).
 */
static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 675
/*
 * This is a subsection for icount migration.  It is sent only when
 * icount is in use (see icount_state_needed()) and carries the icount
 * bias and the instruction counter, plus the optional timer
 * subsections.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        NULL
    }
};
 695
/*
 * Top-level migration description for timers_state, registered by
 * cpu_ticks_init().  cpu_clock_offset is present from version 2 on.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        /* NOTE(review): 8 unused bytes, presumably a removed field -- confirm. */
        VMSTATE_UNUSED(8),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
 711
 712 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 713 {
 714     double pct;
 715     double throttle_ratio;
 716     int64_t sleeptime_ns, endtime_ns;
 717
 718     if (!cpu_throttle_get_percentage()) {
 719         return;
 720     }
 721
 722     pct = (double)cpu_throttle_get_percentage()/100;
 723     throttle_ratio = pct / (1 - pct);
 724     /* Add 1ns to fix double's rounding error (like 0.9999999...) */
 725     sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
 726     endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
 727     while (sleeptime_ns > 0 && !cpu->stop) {
 728         if (sleeptime_ns > SCALE_MS) {
 729             qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
 730                                 sleeptime_ns / SCALE_MS);
 731         } else {
 732             qemu_mutex_unlock_iothread();
 733             g_usleep(sleeptime_ns / SCALE_US);
 734             qemu_mutex_lock_iothread();
 735         }
 736         sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 737     }
 738     atomic_set(&cpu->throttle_thread_scheduled, 0);
 739 }
 740
 741 static void cpu_throttle_timer_tick(void *opaque)
 742 {
 743     CPUState *cpu;
 744     double pct;
 745
 746     /* Stop the timer if needed */
 747     if (!cpu_throttle_get_percentage()) {
 748         return;
 749     }
 750     CPU_FOREACH(cpu) {
 751         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 752             async_run_on_cpu(cpu, cpu_throttle_thread,
 753                              RUN_ON_CPU_NULL);
 754         }
 755     }
 756
 757     pct = (double)cpu_throttle_get_percentage()/100;
 758     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 759                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 760 }
 761
 762 void cpu_throttle_set(int new_throttle_pct)
 763 {
 764     /* Ensure throttle percentage is within valid range */
 765     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 766     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 767
 768     atomic_set(&throttle_percentage, new_throttle_pct);
 769
 770     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 771                                        CPU_THROTTLE_TIMESLICE_NS);
 772 }
 773
/*
 * Turn vCPU throttling off by resetting the percentage to 0.  The
 * throttle timer is not deleted here; cpu_throttle_timer_tick() stops
 * re-arming it once it sees a zero percentage.
 */
void cpu_throttle_stop(void)
{
    atomic_set(&throttle_percentage, 0);
}
 778
 779 bool cpu_throttle_active(void)
 780 {
 781     return (cpu_throttle_get_percentage() != 0);
 782 }
 783
/* Return the current throttle percentage (0 when throttling is off). */
int cpu_throttle_get_percentage(void)
{
    return atomic_read(&throttle_percentage);
}
 788
/*
 * One-time setup of the timer state: initialise the vm_clock seqlock
 * and spinlock, register timers_state for migration, and create the
 * vCPU throttle timer (it is created here but not armed).
 */
void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock);
    qemu_spin_init(&timers_state.vm_clock_lock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                           cpu_throttle_timer_tick, NULL);
}
 797
 798 void configure_icount(QemuOpts *opts, Error **errp)
 799 {
 800     const char *option;
 801     char *rem_str = NULL;
 802
 803     option = qemu_opt_get(opts, "shift");
 804     if (!option) {
 805         if (qemu_opt_get(opts, "align") != NULL) {
 806             error_setg(errp, "Please specify shift option when using align");
 807         }
 808         return;
 809     }
 810
 811     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 812     if (icount_sleep) {
 813         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 814                                          icount_timer_cb, NULL);
 815     }
 816
 817     icount_align_option = qemu_opt_get_bool(opts, "align", false);
 818
 819     if (icount_align_option && !icount_sleep) {
 820         error_setg(errp, "align=on and sleep=off are incompatible");
 821     }
 822     if (strcmp(option, "auto") != 0) {
 823         errno = 0;
 824         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
 825         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 826             error_setg(errp, "icount: Invalid shift value");
 827         }
 828         use_icount = 1;
 829         return;
 830     } else if (icount_align_option) {
 831         error_setg(errp, "shift=auto and align=on are incompatible");
 832     } else if (!icount_sleep) {
 833         error_setg(errp, "shift=auto and sleep=off are incompatible");
 834     }
 835
 836     use_icount = 2;
 837
 838     /* 125MIPS seems a reasonable initial guess at the guest speed.
 839        It will be corrected fairly quickly anyway.  */
 840     timers_state.icount_time_shift = 3;
 841
 842     /* Have both realtime and virtual time triggers for speed adjustment.
 843        The realtime trigger catches emulated time passing too slowly,
 844        the virtual time trigger catches emulated time passing too fast.
 845        Realtime triggers occur even when idle, so use them less frequently
 846        than VM triggers.  */
 847     timers_state.vm_clock_warp_start = -1;
 848     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 849                                    icount_adjust_rt, NULL);
 850     timer_mod(timers_state.icount_rt_timer,
 851                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 852     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 853                                         icount_adjust_vm, NULL);
 854     timer_mod(timers_state.icount_vm_timer,
 855                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 856                    NANOSECONDS_PER_SECOND / 10);
 857 }
 858
 859 /***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */
 870
/* Kick timer; created lazily by start_tcg_kick_timer(). */
static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU that qemu_cpu_kick_rr_next_cpu() forces out of execution. */
static CPUState *tcg_current_rr_cpu;

/* Interval between kicks: one tenth of a second, in nanoseconds. */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 875
 876 static inline int64_t qemu_tcg_next_kick(void)
 877 {
 878     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 879 }
 880
 881 /* Kick the currently round-robin scheduled vCPU to next */
 882 static void qemu_cpu_kick_rr_next_cpu(void)
 883 {
 884     CPUState *cpu;
 885     do {
 886         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 887         if (cpu) {
 888             cpu_exit(cpu);
 889         }
 890     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 891 }
 892
 893 /* Kick all RR vCPUs */
 894 static void qemu_cpu_kick_rr_cpus(void)
 895 {
 896     CPUState *cpu;
 897
 898     CPU_FOREACH(cpu) {
 899         cpu_exit(cpu);
 900     };
 901 }
 902
 903 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
 904 {
 905 }
 906
 907 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 908 {
 909     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 910         qemu_notify_event();
 911         return;
 912     }
 913
 914     if (qemu_in_vcpu_thread()) {
 915         /* A CPU is currently running; kick it back out to the
 916          * tcg_cpu_exec() loop so it will recalculate its
 917          * icount deadline immediately.
 918          */
 919         qemu_cpu_kick(current_cpu);
 920     } else if (first_cpu) {
 921         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 922          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 923          * causes cpu_thread_is_idle to return false.  This way,
 924          * handle_icount_deadline can run.
 925          * If we have no CPUs at all for some reason, we don't
 926          * need to do anything.
 927          */
 928         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 929     }
 930 }
 931
 932 static void kick_tcg_thread(void *opaque)
 933 {
 934     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 935     qemu_cpu_kick_rr_next_cpu();
 936 }
 937
 938 static void start_tcg_kick_timer(void)
 939 {
 940     assert(!mttcg_enabled);
 941     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 942         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 943                                            kick_tcg_thread, NULL);
 944     }
 945     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
 946         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 947     }
 948 }
 949
 950 static void stop_tcg_kick_timer(void)
 951 {
 952     assert(!mttcg_enabled);
 953     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
 954         timer_del(tcg_kick_vcpu_timer);
 955     }
 956 }
 957
 958 /***********************************************************/
 959 void hw_error(const char *fmt, ...)
 960 {
 961     va_list ap;
 962     CPUState *cpu;
 963
 964     va_start(ap, fmt);
 965     fprintf(stderr, "qemu: hardware error: ");
 966     vfprintf(stderr, fmt, ap);
 967     fprintf(stderr, "\n");
 968     CPU_FOREACH(cpu) {
 969         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
 970         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
 971     }
 972     va_end(ap);
 973     abort();
 974 }
 975
 976 void cpu_synchronize_all_states(void)
 977 {
 978     CPUState *cpu;
 979
 980     CPU_FOREACH(cpu) {
 981         cpu_synchronize_state(cpu);
 982         /* TODO: move to cpu_synchronize_state() */
 983         if (hvf_enabled()) {
 984             hvf_cpu_synchronize_state(cpu);
 985         }
 986     }
 987 }
 988
 989 void cpu_synchronize_all_post_reset(void)
 990 {
 991     CPUState *cpu;
 992
 993     CPU_FOREACH(cpu) {
 994         cpu_synchronize_post_reset(cpu);
 995         /* TODO: move to cpu_synchronize_post_reset() */
 996         if (hvf_enabled()) {
 997             hvf_cpu_synchronize_post_reset(cpu);
 998         }
 999     }
1000 }
1001
1002 void cpu_synchronize_all_post_init(void)
1003 {
1004     CPUState *cpu;
1005
1006     CPU_FOREACH(cpu) {
1007         cpu_synchronize_post_init(cpu);
1008         /* TODO: move to cpu_synchronize_post_init() */
1009         if (hvf_enabled()) {
1010             hvf_cpu_synchronize_post_init(cpu);
1011         }
1012     }
1013 }
1014
1015 void cpu_synchronize_all_pre_loadvm(void)
1016 {
1017     CPUState *cpu;
1018
1019     CPU_FOREACH(cpu) {
1020         cpu_synchronize_pre_loadvm(cpu);
1021     }
1022 }
1023
1024 static int do_vm_stop(RunState state, bool send_stop)
1025 {
1026     int ret = 0;
1027
1028     if (runstate_is_running()) {
1029         runstate_set(state);
1030         cpu_disable_ticks();
1031         pause_all_vcpus();
1032         vm_state_notify(0, state);
1033         if (send_stop) {
1034             qapi_event_send_stop();
1035         }
1036     }
1037
1038     bdrv_drain_all();
1039     ret = bdrv_flush_all();
1040
1041     return ret;
1042 }
1043
1044 /* Special vm_stop() variant for terminating the process.  Historically clients
1045  * did not expect a QMP STOP event and so we need to retain compatibility.
1046  */
1047 int vm_shutdown(void)
1048 {
1049     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1050 }
1051
1052 static bool cpu_can_run(CPUState *cpu)
1053 {
1054     if (cpu->stop) {
1055         return false;
1056     }
1057     if (cpu_is_stopped(cpu)) {
1058         return false;
1059     }
1060     return true;
1061 }
1062
1063 static void cpu_handle_guest_debug(CPUState *cpu)
1064 {
1065     gdb_set_stop_cpu(cpu);
1066     qemu_system_debug_request();
1067     cpu->stopped = true;
1068 }
1069
1070 #ifdef CONFIG_LINUX
1071 static void sigbus_reraise(void)
1072 {
1073     sigset_t set;
1074     struct sigaction action;
1075
1076     memset(&action, 0, sizeof(action));
1077     action.sa_handler = SIG_DFL;
1078     if (!sigaction(SIGBUS, &action, NULL)) {
1079         raise(SIGBUS);
1080         sigemptyset(&set);
1081         sigaddset(&set, SIGBUS);
1082         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1083     }
1084     perror("Failed to re-raise SIGBUS!\n");
1085     abort();
1086 }
1087
1088 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1089 {
1090     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1091         sigbus_reraise();
1092     }
1093
1094     if (current_cpu) {
1095         /* Called asynchronously in VCPU thread.  */
1096         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1097             sigbus_reraise();
1098         }
1099     } else {
1100         /* Called synchronously (via signalfd) in main thread.  */
1101         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1102             sigbus_reraise();
1103         }
1104     }
1105 }
1106
1107 static void qemu_init_sigbus(void)
1108 {
1109     struct sigaction action;
1110
1111     memset(&action, 0, sizeof(action));
1112     action.sa_flags = SA_SIGINFO;
1113     action.sa_sigaction = sigbus_handler;
1114     sigaction(SIGBUS, &action, NULL);
1115
1116     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1117 }
1118 #else /* !CONFIG_LINUX */
/* Non-Linux hosts: no SIGBUS handling is set up. */
static void qemu_init_sigbus(void)
{
    /* nothing to do */
}
1122 #endif /* !CONFIG_LINUX */
1123
1124 static QemuThread io_thread;
1125
1126 /* cpu creation */
1127 static QemuCond qemu_cpu_cond;
1128 /* system init */
1129 static QemuCond qemu_pause_cond;
1130
1131 void qemu_init_cpu_loop(void)
1132 {
1133     qemu_init_sigbus();
1134     qemu_cond_init(&qemu_cpu_cond);
1135     qemu_cond_init(&qemu_pause_cond);
1136     qemu_mutex_init(&qemu_global_mutex);
1137
1138     qemu_thread_get_self(&io_thread);
1139 }
1140
1141 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1142 {
1143     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1144 }
1145
1146 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1147 {
1148     if (kvm_destroy_vcpu(cpu) < 0) {
1149         error_report("kvm_destroy_vcpu failed");
1150         exit(EXIT_FAILURE);
1151     }
1152 }
1153
1154 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1155 {
1156 }
1157
1158 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1159 {
1160     g_assert(qemu_cpu_is_self(cpu));
1161     cpu->stop = false;
1162     cpu->stopped = true;
1163     if (exit) {
1164         cpu_exit(cpu);
1165     }
1166     qemu_cond_broadcast(&qemu_pause_cond);
1167 }
1168
1169 static void qemu_wait_io_event_common(CPUState *cpu)
1170 {
1171     atomic_mb_set(&cpu->thread_kicked, false);
1172     if (cpu->stop) {
1173         qemu_cpu_stop(cpu, false);
1174     }
1175     process_queued_cpu_work(cpu);
1176 }
1177
1178 static void qemu_tcg_rr_wait_io_event(void)
1179 {
1180     CPUState *cpu;
1181
1182     while (all_cpu_threads_idle()) {
1183         stop_tcg_kick_timer();
1184         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1185     }
1186
1187     start_tcg_kick_timer();
1188
1189     CPU_FOREACH(cpu) {
1190         qemu_wait_io_event_common(cpu);
1191     }
1192 }
1193
1194 static void qemu_wait_io_event(CPUState *cpu)
1195 {
1196     bool slept = false;
1197
1198     while (cpu_thread_is_idle(cpu)) {
1199         if (!slept) {
1200             slept = true;
1201             qemu_plugin_vcpu_idle_cb(cpu);
1202         }
1203         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1204     }
1205     if (slept) {
1206         qemu_plugin_vcpu_resume_cb(cpu);
1207     }
1208
1209 #ifdef _WIN32
1210     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1211     if (!tcg_enabled()) {
1212         SleepEx(0, TRUE);
1213     }
1214 #endif
1215     qemu_wait_io_event_common(cpu);
1216 }
1217
1218 static void *qemu_kvm_cpu_thread_fn(void *arg)
1219 {
1220     CPUState *cpu = arg;
1221     int r;
1222
1223     rcu_register_thread();
1224
1225     qemu_mutex_lock_iothread();
1226     qemu_thread_get_self(cpu->thread);
1227     cpu->thread_id = qemu_get_thread_id();
1228     cpu->can_do_io = 1;
1229     current_cpu = cpu;
1230
1231     r = kvm_init_vcpu(cpu);
1232     if (r < 0) {
1233         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1234         exit(1);
1235     }
1236
1237     kvm_init_cpu_signals(cpu);
1238
1239     /* signal CPU creation */
1240     cpu->created = true;
1241     qemu_cond_signal(&qemu_cpu_cond);
1242     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1243
1244     do {
1245         if (cpu_can_run(cpu)) {
1246             r = kvm_cpu_exec(cpu);
1247             if (r == EXCP_DEBUG) {
1248                 cpu_handle_guest_debug(cpu);
1249             }
1250         }
1251         qemu_wait_io_event(cpu);
1252     } while (!cpu->unplug || cpu_can_run(cpu));
1253
1254     qemu_kvm_destroy_vcpu(cpu);
1255     cpu->created = false;
1256     qemu_cond_signal(&qemu_cpu_cond);
1257     qemu_mutex_unlock_iothread();
1258     rcu_unregister_thread();
1259     return NULL;
1260 }
1261
1262 static void *qemu_dummy_cpu_thread_fn(void *arg)
1263 {
1264 #ifdef _WIN32
1265     error_report("qtest is not supported under Windows");
1266     exit(1);
1267 #else
1268     CPUState *cpu = arg;
1269     sigset_t waitset;
1270     int r;
1271
1272     rcu_register_thread();
1273
1274     qemu_mutex_lock_iothread();
1275     qemu_thread_get_self(cpu->thread);
1276     cpu->thread_id = qemu_get_thread_id();
1277     cpu->can_do_io = 1;
1278     current_cpu = cpu;
1279
1280     sigemptyset(&waitset);
1281     sigaddset(&waitset, SIG_IPI);
1282
1283     /* signal CPU creation */
1284     cpu->created = true;
1285     qemu_cond_signal(&qemu_cpu_cond);
1286     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1287
1288     do {
1289         qemu_mutex_unlock_iothread();
1290         do {
1291             int sig;
1292             r = sigwait(&waitset, &sig);
1293         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1294         if (r == -1) {
1295             perror("sigwait");
1296             exit(1);
1297         }
1298         qemu_mutex_lock_iothread();
1299         qemu_wait_io_event(cpu);
1300     } while (!cpu->unplug);
1301
1302     qemu_mutex_unlock_iothread();
1303     rcu_unregister_thread();
1304     return NULL;
1305 #endif
1306 }
1307
1308 static int64_t tcg_get_icount_limit(void)
1309 {
1310     int64_t deadline;
1311
1312     if (replay_mode != REPLAY_MODE_PLAY) {
1313         /*
1314          * Include all the timers, because they may need an attention.
1315          * Too long CPU execution may create unnecessary delay in UI.
1316          */
1317         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1318                                               QEMU_TIMER_ATTR_ALL);
1319         /* Check realtime timers, because they help with input processing */
1320         deadline = qemu_soonest_timeout(deadline,
1321                 qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
1322                                            QEMU_TIMER_ATTR_ALL));
1323
1324         /* Maintain prior (possibly buggy) behaviour where if no deadline
1325          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1326          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1327          * nanoseconds.
1328          */
1329         if ((deadline < 0) || (deadline > INT32_MAX)) {
1330             deadline = INT32_MAX;
1331         }
1332
1333         return qemu_icount_round(deadline);
1334     } else {
1335         return replay_get_instructions();
1336     }
1337 }
1338
1339 static void handle_icount_deadline(void)
1340 {
1341     assert(qemu_in_vcpu_thread());
1342     if (use_icount) {
1343         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1344                                                       QEMU_TIMER_ATTR_ALL);
1345
1346         if (deadline == 0) {
1347             /* Wake up other AioContexts.  */
1348             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1349             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1350         }
1351     }
1352 }
1353
1354 static void prepare_icount_for_run(CPUState *cpu)
1355 {
1356     if (use_icount) {
1357         int insns_left;
1358
1359         /* These should always be cleared by process_icount_data after
1360          * each vCPU execution. However u16.high can be raised
1361          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1362          */
1363         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1364         g_assert(cpu->icount_extra == 0);
1365
1366         cpu->icount_budget = tcg_get_icount_limit();
1367         insns_left = MIN(0xffff, cpu->icount_budget);
1368         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1369         cpu->icount_extra = cpu->icount_budget - insns_left;
1370
1371         replay_mutex_lock();
1372     }
1373 }
1374
1375 static void process_icount_data(CPUState *cpu)
1376 {
1377     if (use_icount) {
1378         /* Account for executed instructions */
1379         cpu_update_icount(cpu);
1380
1381         /* Reset the counters */
1382         cpu_neg(cpu)->icount_decr.u16.low = 0;
1383         cpu->icount_extra = 0;
1384         cpu->icount_budget = 0;
1385
1386         replay_account_executed_instructions();
1387
1388         replay_mutex_unlock();
1389     }
1390 }
1391
1392
1393 static int tcg_cpu_exec(CPUState *cpu)
1394 {
1395     int ret;
1396 #ifdef CONFIG_PROFILER
1397     int64_t ti;
1398 #endif
1399
1400     assert(tcg_enabled());
1401 #ifdef CONFIG_PROFILER
1402     ti = profile_getclock();
1403 #endif
1404     cpu_exec_start(cpu);
1405     ret = cpu_exec(cpu);
1406     cpu_exec_end(cpu);
1407 #ifdef CONFIG_PROFILER
1408     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1409                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1410 #endif
1411     return ret;
1412 }
1413
1414 /* Destroy any remaining vCPUs which have been unplugged and have
1415  * finished running
1416  */
1417 static void deal_with_unplugged_cpus(void)
1418 {
1419     CPUState *cpu;
1420
1421     CPU_FOREACH(cpu) {
1422         if (cpu->unplug && !cpu_can_run(cpu)) {
1423             qemu_tcg_destroy_vcpu(cpu);
1424             cpu->created = false;
1425             qemu_cond_signal(&qemu_cpu_cond);
1426             break;
1427         }
1428     }
1429 }
1430
1431 /* Single-threaded TCG
1432  *
1433  * In the single-threaded case each vCPU is simulated in turn. If
1434  * there is more than a single vCPU we create a simple timer to kick
1435  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1436  * This is done explicitly rather than relying on side-effects
1437  * elsewhere.
1438  */
1439
1440 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1441 {
1442     CPUState *cpu = arg;
1443
1444     assert(tcg_enabled());
1445     rcu_register_thread();
1446     tcg_register_thread();
1447
1448     qemu_mutex_lock_iothread();
1449     qemu_thread_get_self(cpu->thread);
1450
1451     cpu->thread_id = qemu_get_thread_id();
1452     cpu->created = true;
1453     cpu->can_do_io = 1;
1454     qemu_cond_signal(&qemu_cpu_cond);
1455     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1456
1457     /* wait for initial kick-off after machine start */
1458     while (first_cpu->stopped) {
1459         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1460
1461         /* process any pending work */
1462         CPU_FOREACH(cpu) {
1463             current_cpu = cpu;
1464             qemu_wait_io_event_common(cpu);
1465         }
1466     }
1467
1468     start_tcg_kick_timer();
1469
1470     cpu = first_cpu;
1471
1472     /* process any pending work */
1473     cpu->exit_request = 1;
1474
1475     while (1) {
1476         qemu_mutex_unlock_iothread();
1477         replay_mutex_lock();
1478         qemu_mutex_lock_iothread();
1479         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1480         qemu_account_warp_timer();
1481
1482         /* Run the timers here.  This is much more efficient than
1483          * waking up the I/O thread and waiting for completion.
1484          */
1485         handle_icount_deadline();
1486
1487         replay_mutex_unlock();
1488
1489         if (!cpu) {
1490             cpu = first_cpu;
1491         }
1492
1493         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1494
1495             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1496             current_cpu = cpu;
1497
1498             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1499                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1500
1501             if (cpu_can_run(cpu)) {
1502                 int r;
1503
1504                 qemu_mutex_unlock_iothread();
1505                 prepare_icount_for_run(cpu);
1506
1507                 r = tcg_cpu_exec(cpu);
1508
1509                 process_icount_data(cpu);
1510                 qemu_mutex_lock_iothread();
1511
1512                 if (r == EXCP_DEBUG) {
1513                     cpu_handle_guest_debug(cpu);
1514                     break;
1515                 } else if (r == EXCP_ATOMIC) {
1516                     qemu_mutex_unlock_iothread();
1517                     cpu_exec_step_atomic(cpu);
1518                     qemu_mutex_lock_iothread();
1519                     break;
1520                 }
1521             } else if (cpu->stop) {
1522                 if (cpu->unplug) {
1523                     cpu = CPU_NEXT(cpu);
1524                 }
1525                 break;
1526             }
1527
1528             cpu = CPU_NEXT(cpu);
1529         } /* while (cpu && !cpu->exit_request).. */
1530
1531         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1532         atomic_set(&tcg_current_rr_cpu, NULL);
1533
1534         if (cpu && cpu->exit_request) {
1535             atomic_mb_set(&cpu->exit_request, 0);
1536         }
1537
1538         if (use_icount && all_cpu_threads_idle()) {
1539             /*
1540              * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1541              * in the main_loop, wake it up in order to start the warp timer.
1542              */
1543             qemu_notify_event();
1544         }
1545
1546         qemu_tcg_rr_wait_io_event();
1547         deal_with_unplugged_cpus();
1548     }
1549
1550     rcu_unregister_thread();
1551     return NULL;
1552 }
1553
1554 static void *qemu_hax_cpu_thread_fn(void *arg)
1555 {
1556     CPUState *cpu = arg;
1557     int r;
1558
1559     rcu_register_thread();
1560     qemu_mutex_lock_iothread();
1561     qemu_thread_get_self(cpu->thread);
1562
1563     cpu->thread_id = qemu_get_thread_id();
1564     cpu->created = true;
1565     current_cpu = cpu;
1566
1567     hax_init_vcpu(cpu);
1568     qemu_cond_signal(&qemu_cpu_cond);
1569     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1570
1571     do {
1572         if (cpu_can_run(cpu)) {
1573             r = hax_smp_cpu_exec(cpu);
1574             if (r == EXCP_DEBUG) {
1575                 cpu_handle_guest_debug(cpu);
1576             }
1577         }
1578
1579         qemu_wait_io_event(cpu);
1580     } while (!cpu->unplug || cpu_can_run(cpu));
1581     rcu_unregister_thread();
1582     return NULL;
1583 }
1584
1585 /* The HVF-specific vCPU thread function. This one should only run when the host
1586  * CPU supports the VMX "unrestricted guest" feature. */
1587 static void *qemu_hvf_cpu_thread_fn(void *arg)
1588 {
1589     CPUState *cpu = arg;
1590
1591     int r;
1592
1593     assert(hvf_enabled());
1594
1595     rcu_register_thread();
1596
1597     qemu_mutex_lock_iothread();
1598     qemu_thread_get_self(cpu->thread);
1599
1600     cpu->thread_id = qemu_get_thread_id();
1601     cpu->can_do_io = 1;
1602     current_cpu = cpu;
1603
1604     hvf_init_vcpu(cpu);
1605
1606     /* signal CPU creation */
1607     cpu->created = true;
1608     qemu_cond_signal(&qemu_cpu_cond);
1609     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1610
1611     do {
1612         if (cpu_can_run(cpu)) {
1613             r = hvf_vcpu_exec(cpu);
1614             if (r == EXCP_DEBUG) {
1615                 cpu_handle_guest_debug(cpu);
1616             }
1617         }
1618         qemu_wait_io_event(cpu);
1619     } while (!cpu->unplug || cpu_can_run(cpu));
1620
1621     hvf_vcpu_destroy(cpu);
1622     cpu->created = false;
1623     qemu_cond_signal(&qemu_cpu_cond);
1624     qemu_mutex_unlock_iothread();
1625     rcu_unregister_thread();
1626     return NULL;
1627 }
1628
1629 static void *qemu_whpx_cpu_thread_fn(void *arg)
1630 {
1631     CPUState *cpu = arg;
1632     int r;
1633
1634     rcu_register_thread();
1635
1636     qemu_mutex_lock_iothread();
1637     qemu_thread_get_self(cpu->thread);
1638     cpu->thread_id = qemu_get_thread_id();
1639     current_cpu = cpu;
1640
1641     r = whpx_init_vcpu(cpu);
1642     if (r < 0) {
1643         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1644         exit(1);
1645     }
1646
1647     /* signal CPU creation */
1648     cpu->created = true;
1649     qemu_cond_signal(&qemu_cpu_cond);
1650     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1651
1652     do {
1653         if (cpu_can_run(cpu)) {
1654             r = whpx_vcpu_exec(cpu);
1655             if (r == EXCP_DEBUG) {
1656                 cpu_handle_guest_debug(cpu);
1657             }
1658         }
1659         while (cpu_thread_is_idle(cpu)) {
1660             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1661         }
1662         qemu_wait_io_event_common(cpu);
1663     } while (!cpu->unplug || cpu_can_run(cpu));
1664
1665     whpx_destroy_vcpu(cpu);
1666     cpu->created = false;
1667     qemu_cond_signal(&qemu_cpu_cond);
1668     qemu_mutex_unlock_iothread();
1669     rcu_unregister_thread();
1670     return NULL;
1671 }
1672
1673 #ifdef _WIN32
1674 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1675 {
1676 }
1677 #endif
1678
1679 /* Multi-threaded TCG
1680  *
1681  * In the multi-threaded case each vCPU has its own thread. The TLS
1682  * variable current_cpu can be used deep in the code to find the
1683  * current CPUState for a given thread.
1684  */
1685
1686 static void *qemu_tcg_cpu_thread_fn(void *arg)
1687 {
1688     CPUState *cpu = arg;
1689
1690     assert(tcg_enabled());
1691     g_assert(!use_icount);
1692
1693     rcu_register_thread();
1694     tcg_register_thread();
1695
1696     qemu_mutex_lock_iothread();
1697     qemu_thread_get_self(cpu->thread);
1698
1699     cpu->thread_id = qemu_get_thread_id();
1700     cpu->created = true;
1701     cpu->can_do_io = 1;
1702     current_cpu = cpu;
1703     qemu_cond_signal(&qemu_cpu_cond);
1704     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1705
1706     /* process any pending work */
1707     cpu->exit_request = 1;
1708
1709     do {
1710         if (cpu_can_run(cpu)) {
1711             int r;
1712             qemu_mutex_unlock_iothread();
1713             r = tcg_cpu_exec(cpu);
1714             qemu_mutex_lock_iothread();
1715             switch (r) {
1716             case EXCP_DEBUG:
1717                 cpu_handle_guest_debug(cpu);
1718                 break;
1719             case EXCP_HALTED:
1720                 /* during start-up the vCPU is reset and the thread is
1721                  * kicked several times. If we don't ensure we go back
1722                  * to sleep in the halted state we won't cleanly
1723                  * start-up when the vCPU is enabled.
1724                  *
1725                  * cpu->halted should ensure we sleep in wait_io_event
1726                  */
1727                 g_assert(cpu->halted);
1728                 break;
1729             case EXCP_ATOMIC:
1730                 qemu_mutex_unlock_iothread();
1731                 cpu_exec_step_atomic(cpu);
1732                 qemu_mutex_lock_iothread();
1733             default:
1734                 /* Ignore everything else? */
1735                 break;
1736             }
1737         }
1738
1739         atomic_mb_set(&cpu->exit_request, 0);
1740         qemu_wait_io_event(cpu);
1741     } while (!cpu->unplug || cpu_can_run(cpu));
1742
1743     qemu_tcg_destroy_vcpu(cpu);
1744     cpu->created = false;
1745     qemu_cond_signal(&qemu_cpu_cond);
1746     qemu_mutex_unlock_iothread();
1747     rcu_unregister_thread();
1748     return NULL;
1749 }
1750
1751 static void qemu_cpu_kick_thread(CPUState *cpu)
1752 {
1753 #ifndef _WIN32
1754     int err;
1755
1756     if (cpu->thread_kicked) {
1757         return;
1758     }
1759     cpu->thread_kicked = true;
1760     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1761     if (err && err != ESRCH) {
1762         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1763         exit(1);
1764     }
1765 #else /* _WIN32 */
1766     if (!qemu_cpu_is_self(cpu)) {
1767         if (whpx_enabled()) {
1768             whpx_vcpu_kick(cpu);
1769         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1770             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1771                     __func__, GetLastError());
1772             exit(1);
1773         }
1774     }
1775 #endif
1776 }
1777
1778 void qemu_cpu_kick(CPUState *cpu)
1779 {
1780     qemu_cond_broadcast(cpu->halt_cond);
1781     if (tcg_enabled()) {
1782         if (qemu_tcg_mttcg_enabled()) {
1783             cpu_exit(cpu);
1784         } else {
1785             qemu_cpu_kick_rr_cpus();
1786         }
1787     } else {
1788         if (hax_enabled()) {
1789             /*
1790              * FIXME: race condition with the exit_request check in
1791              * hax_vcpu_hax_exec
1792              */
1793             cpu->exit_request = 1;
1794         }
1795         qemu_cpu_kick_thread(cpu);
1796     }
1797 }
1798
1799 void qemu_cpu_kick_self(void)
1800 {
1801     assert(current_cpu);
1802     qemu_cpu_kick_thread(current_cpu);
1803 }
1804
1805 bool qemu_cpu_is_self(CPUState *cpu)
1806 {
1807     return qemu_thread_is_self(cpu->thread);
1808 }
1809
1810 bool qemu_in_vcpu_thread(void)
1811 {
1812     return current_cpu && qemu_cpu_is_self(current_cpu);
1813 }
1814
/* Per-thread flag: true while this thread holds the iothread mutex. */
static __thread bool iothread_locked;

/* Report whether the calling thread currently holds the iothread mutex. */
bool qemu_mutex_iothread_locked(void)
{
    bool held = iothread_locked;

    return held;
}
1821
1822 /*
1823  * The BQL is taken from so many places that it is worth profiling the
1824  * callers directly, instead of funneling them all through a single function.
1825  */
1826 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1827 {
1828     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1829
1830     g_assert(!qemu_mutex_iothread_locked());
1831     bql_lock(&qemu_global_mutex, file, line);
1832     iothread_locked = true;
1833 }
1834
1835 void qemu_mutex_unlock_iothread(void)
1836 {
1837     g_assert(qemu_mutex_iothread_locked());
1838     iothread_locked = false;
1839     qemu_mutex_unlock(&qemu_global_mutex);
1840 }
1841
1842 void qemu_cond_wait_iothread(QemuCond *cond)
1843 {
1844     qemu_cond_wait(cond, &qemu_global_mutex);
1845 }
1846
1847 static bool all_vcpus_paused(void)
1848 {
1849     CPUState *cpu;
1850
1851     CPU_FOREACH(cpu) {
1852         if (!cpu->stopped) {
1853             return false;
1854         }
1855     }
1856
1857     return true;
1858 }
1859
1860 void pause_all_vcpus(void)
1861 {
1862     CPUState *cpu;
1863
1864     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1865     CPU_FOREACH(cpu) {
1866         if (qemu_cpu_is_self(cpu)) {
1867             qemu_cpu_stop(cpu, true);
1868         } else {
1869             cpu->stop = true;
1870             qemu_cpu_kick(cpu);
1871         }
1872     }
1873
1874     /* We need to drop the replay_lock so any vCPU threads woken up
1875      * can finish their replay tasks
1876      */
1877     replay_mutex_unlock();
1878
1879     while (!all_vcpus_paused()) {
1880         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1881         CPU_FOREACH(cpu) {
1882             qemu_cpu_kick(cpu);
1883         }
1884     }
1885
1886     qemu_mutex_unlock_iothread();
1887     replay_mutex_lock();
1888     qemu_mutex_lock_iothread();
1889 }
1890
1891 void cpu_resume(CPUState *cpu)
1892 {
1893     cpu->stop = false;
1894     cpu->stopped = false;
1895     qemu_cpu_kick(cpu);
1896 }
1897
1898 void resume_all_vcpus(void)
1899 {
1900     CPUState *cpu;
1901
1902     if (!runstate_is_running()) {
1903         return;
1904     }
1905
1906     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1907     CPU_FOREACH(cpu) {
1908         cpu_resume(cpu);
1909     }
1910 }
1911
1912 void cpu_remove_sync(CPUState *cpu)
1913 {
1914     cpu->stop = true;
1915     cpu->unplug = true;
1916     qemu_cpu_kick(cpu);
1917     qemu_mutex_unlock_iothread();
1918     qemu_thread_join(cpu->thread);
1919     qemu_mutex_lock_iothread();
1920 }
1921
1922 /* For temporary buffers for forming a name */
1923 #define VCPU_THREAD_NAME_SIZE 16
1924
1925 static void qemu_tcg_init_vcpu(CPUState *cpu)
1926 {
1927     char thread_name[VCPU_THREAD_NAME_SIZE];
1928     static QemuCond *single_tcg_halt_cond;
1929     static QemuThread *single_tcg_cpu_thread;
1930     static int tcg_region_inited;
1931
1932     assert(tcg_enabled());
1933     /*
1934      * Initialize TCG regions--once. Now is a good time, because:
1935      * (1) TCG's init context, prologue and target globals have been set up.
1936      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1937      *     -accel flag is processed, so the check doesn't work then).
1938      */
1939     if (!tcg_region_inited) {
1940         tcg_region_inited = 1;
1941         tcg_region_init();
1942     }
1943
1944     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1945         cpu->thread = g_malloc0(sizeof(QemuThread));
1946         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1947         qemu_cond_init(cpu->halt_cond);
1948
1949         if (qemu_tcg_mttcg_enabled()) {
1950             /* create a thread per vCPU with TCG (MTTCG) */
1951             parallel_cpus = true;
1952             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1953                  cpu->cpu_index);
1954
1955             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1956                                cpu, QEMU_THREAD_JOINABLE);
1957
1958         } else {
1959             /* share a single thread for all cpus with TCG */
1960             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1961             qemu_thread_create(cpu->thread, thread_name,
1962                                qemu_tcg_rr_cpu_thread_fn,
1963                                cpu, QEMU_THREAD_JOINABLE);
1964
1965             single_tcg_halt_cond = cpu->halt_cond;
1966             single_tcg_cpu_thread = cpu->thread;
1967         }
1968 #ifdef _WIN32
1969         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1970 #endif
1971     } else {
1972         /* For non-MTTCG cases we share the thread */
1973         cpu->thread = single_tcg_cpu_thread;
1974         cpu->halt_cond = single_tcg_halt_cond;
1975         cpu->thread_id = first_cpu->thread_id;
1976         cpu->can_do_io = 1;
1977         cpu->created = true;
1978     }
1979 }
1980
1981 static void qemu_hax_start_vcpu(CPUState *cpu)
1982 {
1983     char thread_name[VCPU_THREAD_NAME_SIZE];
1984
1985     cpu->thread = g_malloc0(sizeof(QemuThread));
1986     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1987     qemu_cond_init(cpu->halt_cond);
1988
1989     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1990              cpu->cpu_index);
1991     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1992                        cpu, QEMU_THREAD_JOINABLE);
1993 #ifdef _WIN32
1994     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1995 #endif
1996 }
1997
1998 static void qemu_kvm_start_vcpu(CPUState *cpu)
1999 {
2000     char thread_name[VCPU_THREAD_NAME_SIZE];
2001
2002     cpu->thread = g_malloc0(sizeof(QemuThread));
2003     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2004     qemu_cond_init(cpu->halt_cond);
2005     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2006              cpu->cpu_index);
2007     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2008                        cpu, QEMU_THREAD_JOINABLE);
2009 }
2010
2011 static void qemu_hvf_start_vcpu(CPUState *cpu)
2012 {
2013     char thread_name[VCPU_THREAD_NAME_SIZE];
2014
2015     /* HVF currently does not support TCG, and only runs in
2016      * unrestricted-guest mode. */
2017     assert(hvf_enabled());
2018
2019     cpu->thread = g_malloc0(sizeof(QemuThread));
2020     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2021     qemu_cond_init(cpu->halt_cond);
2022
2023     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2024              cpu->cpu_index);
2025     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2026                        cpu, QEMU_THREAD_JOINABLE);
2027 }
2028
2029 static void qemu_whpx_start_vcpu(CPUState *cpu)
2030 {
2031     char thread_name[VCPU_THREAD_NAME_SIZE];
2032
2033     cpu->thread = g_malloc0(sizeof(QemuThread));
2034     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2035     qemu_cond_init(cpu->halt_cond);
2036     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2037              cpu->cpu_index);
2038     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2039                        cpu, QEMU_THREAD_JOINABLE);
2040 #ifdef _WIN32
2041     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2042 #endif
2043 }
2044
2045 static void qemu_dummy_start_vcpu(CPUState *cpu)
2046 {
2047     char thread_name[VCPU_THREAD_NAME_SIZE];
2048
2049     cpu->thread = g_malloc0(sizeof(QemuThread));
2050     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2051     qemu_cond_init(cpu->halt_cond);
2052     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2053              cpu->cpu_index);
2054     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2055                        QEMU_THREAD_JOINABLE);
2056 }
2057
2058 void qemu_init_vcpu(CPUState *cpu)
2059 {
2060     MachineState *ms = MACHINE(qdev_get_machine());
2061
2062     cpu->nr_cores = ms->smp.cores;
2063     cpu->nr_threads =  ms->smp.threads;
2064     cpu->stopped = true;
2065     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2066
2067     if (!cpu->as) {
2068         /* If the target cpu hasn't set up any address spaces itself,
2069          * give it the default one.
2070          */
2071         cpu->num_ases = 1;
2072         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2073     }
2074
2075     if (kvm_enabled()) {
2076         qemu_kvm_start_vcpu(cpu);
2077     } else if (hax_enabled()) {
2078         qemu_hax_start_vcpu(cpu);
2079     } else if (hvf_enabled()) {
2080         qemu_hvf_start_vcpu(cpu);
2081     } else if (tcg_enabled()) {
2082         qemu_tcg_init_vcpu(cpu);
2083     } else if (whpx_enabled()) {
2084         qemu_whpx_start_vcpu(cpu);
2085     } else {
2086         qemu_dummy_start_vcpu(cpu);
2087     }
2088
2089     while (!cpu->created) {
2090         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2091     }
2092 }
2093
2094 void cpu_stop_current(void)
2095 {
2096     if (current_cpu) {
2097         current_cpu->stop = true;
2098         cpu_exit(current_cpu);
2099     }
2100 }
2101
2102 int vm_stop(RunState state)
2103 {
2104     if (qemu_in_vcpu_thread()) {
2105         qemu_system_vmstop_request_prepare();
2106         qemu_system_vmstop_request(state);
2107         /*
2108          * FIXME: should not return to device code in case
2109          * vm_stop() has been requested.
2110          */
2111         cpu_stop_current();
2112         return 0;
2113     }
2114
2115     return do_vm_stop(state, true);
2116 }
2117
2118 /**
2119  * Prepare for (re)starting the VM.
2120  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2121  * running or in case of an error condition), 0 otherwise.
2122  */
2123 int vm_prepare_start(void)
2124 {
2125     RunState requested;
2126
2127     qemu_vmstop_requested(&requested);
2128     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2129         return -1;
2130     }
2131
2132     /* Ensure that a STOP/RESUME pair of events is emitted if a
2133      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2134      * example, according to documentation is always followed by
2135      * the STOP event.
2136      */
2137     if (runstate_is_running()) {
2138         qapi_event_send_stop();
2139         qapi_event_send_resume();
2140         return -1;
2141     }
2142
2143     /* We are sending this now, but the CPUs will be resumed shortly later */
2144     qapi_event_send_resume();
2145
2146     cpu_enable_ticks();
2147     runstate_set(RUN_STATE_RUNNING);
2148     vm_state_notify(1, RUN_STATE_RUNNING);
2149     return 0;
2150 }
2151
/*
 * Start (or resume) the VM: the vCPUs are resumed only when
 * vm_prepare_start() returns 0.
 */
void vm_start(void)
{
    if (vm_prepare_start() != 0) {
        return;
    }

    resume_all_vcpus();
}
2158
2159 /* does a state transition even if the VM is already stopped,
2160    current state is forgotten forever */
2161 int vm_stop_force_state(RunState state)
2162 {
2163     if (runstate_is_running()) {
2164         return vm_stop(state);
2165     } else {
2166         runstate_set(state);
2167
2168         bdrv_drain_all();
2169         /* Make sure to return an error if the flush in a previous vm_stop()
2170          * failed. */
2171         return bdrv_flush_all();
2172     }
2173 }
2174
/*
 * Print the list of available CPU models by invoking the target's cpu_list
 * macro, when the target defines one.  @optarg is not used.
 */
void list_cpus(const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#ifdef cpu_list
    cpu_list();
#endif
}
2182
2183 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2184                  bool has_cpu, int64_t cpu_index, Error **errp)
2185 {
2186     FILE *f;
2187     uint32_t l;
2188     CPUState *cpu;
2189     uint8_t buf[1024];
2190     int64_t orig_addr = addr, orig_size = size;
2191
2192     if (!has_cpu) {
2193         cpu_index = 0;
2194     }
2195
2196     cpu = qemu_get_cpu(cpu_index);
2197     if (cpu == NULL) {
2198         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2199                    "a CPU number");
2200         return;
2201     }
2202
2203     f = fopen(filename, "wb");
2204     if (!f) {
2205         error_setg_file_open(errp, errno, filename);
2206         return;
2207     }
2208
2209     while (size != 0) {
2210         l = sizeof(buf);
2211         if (l > size)
2212             l = size;
2213         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2214             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2215                              " specified", orig_addr, orig_size);
2216             goto exit;
2217         }
2218         if (fwrite(buf, 1, l, f) != l) {
2219             error_setg(errp, QERR_IO_ERROR);
2220             goto exit;
2221         }
2222         addr += l;
2223         size -= l;
2224     }
2225
2226 exit:
2227     fclose(f);
2228 }
2229
2230 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2231                   Error **errp)
2232 {
2233     FILE *f;
2234     uint32_t l;
2235     uint8_t buf[1024];
2236
2237     f = fopen(filename, "wb");
2238     if (!f) {
2239         error_setg_file_open(errp, errno, filename);
2240         return;
2241     }
2242
2243     while (size != 0) {
2244         l = sizeof(buf);
2245         if (l > size)
2246             l = size;
2247         cpu_physical_memory_read(addr, buf, l);
2248         if (fwrite(buf, 1, l, f) != l) {
2249             error_setg(errp, QERR_IO_ERROR);
2250             goto exit;
2251         }
2252         addr += l;
2253         size -= l;
2254     }
2255
2256 exit:
2257     fclose(f);
2258 }
2259
2260 void qmp_inject_nmi(Error **errp)
2261 {
2262     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2263 }
2264
2265 void dump_drift_info(void)
2266 {
2267     if (!use_icount) {
2268         return;
2269     }
2270
2271     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2272                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2273     if (icount_align_option) {
2274         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2275                     -max_delay / SCALE_MS);
2276         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2277                     max_advance / SCALE_MS);
2278     } else {
2279         qemu_printf("Max guest delay     NA\n");
2280         qemu_printf("Max guest advance   NA\n");
2281     }
2282 }