cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu-common.h"
  27 #include "qemu/config-file.h"
  28 #include "qemu/cutils.h"
  29 #include "migration/vmstate.h"
  30 #include "monitor/monitor.h"
  31 #include "qapi/error.h"
  32 #include "qapi/qapi-commands-misc.h"
  33 #include "qapi/qapi-events-run-state.h"
  34 #include "qapi/qmp/qerror.h"
  35 #include "qemu/error-report.h"
  36 #include "qemu/qemu-print.h"
  37 #include "sysemu/tcg.h"
  38 #include "sysemu/block-backend.h"
  39 #include "exec/gdbstub.h"
  40 #include "sysemu/dma.h"
  41 #include "sysemu/hw_accel.h"
  42 #include "sysemu/kvm.h"
  43 #include "sysemu/hax.h"
  44 #include "sysemu/hvf.h"
  45 #include "sysemu/whpx.h"
  46 #include "exec/exec-all.h"
  47
  48 #include "qemu/thread.h"
  49 #include "qemu/plugin.h"
  50 #include "sysemu/cpus.h"
  51 #include "sysemu/qtest.h"
  52 #include "qemu/main-loop.h"
  53 #include "qemu/option.h"
  54 #include "qemu/bitmap.h"
  55 #include "qemu/seqlock.h"
  56 #include "qemu/guest-random.h"
  57 #include "tcg/tcg.h"
  58 #include "hw/nmi.h"
  59 #include "sysemu/replay.h"
  60 #include "sysemu/runstate.h"
  61 #include "hw/boards.h"
  62 #include "hw/hw.h"
  63
  64 #ifdef CONFIG_LINUX
  65
  66 #include <sys/prctl.h>
  67
  68 #ifndef PR_MCE_KILL
  69 #define PR_MCE_KILL 33
  70 #endif
  71
  72 #ifndef PR_MCE_KILL_SET
  73 #define PR_MCE_KILL_SET 1
  74 #endif
  75
  76 #ifndef PR_MCE_KILL_EARLY
  77 #define PR_MCE_KILL_EARLY 1
  78 #endif
  79
  80 #endif /* CONFIG_LINUX */
  81
/*
 * Mutex shared by this file; cpu_throttle_thread() hands it to
 * qemu_cond_timedwait() while a throttled vCPU sleeps.
 */
static QemuMutex qemu_global_mutex;

/*
 * NOTE(review): neither variable is referenced in this part of the file;
 * presumably they are consumed elsewhere -- confirm before relying on them.
 */
int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
/* Timer whose callback is cpu_throttle_timer_tick() (see cpu_ticks_init()). */
static QEMUTimer *throttle_timer;
/* Current throttle percentage; 0 means throttling is inactive. */
static unsigned int throttle_percentage;

/* Bounds that cpu_throttle_set() clamps the requested percentage to. */
#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/* Base time slice, in nanoseconds, used by the throttle computations. */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
  94
  95 bool cpu_is_stopped(CPUState *cpu)
  96 {
  97     return cpu->stopped || !runstate_is_running();
  98 }
  99
 100 static bool cpu_thread_is_idle(CPUState *cpu)
 101 {
 102     if (cpu->stop || cpu->queued_work_first) {
 103         return false;
 104     }
 105     if (cpu_is_stopped(cpu)) {
 106         return true;
 107     }
 108     if (!cpu->halted || cpu_has_work(cpu) ||
 109         kvm_halt_in_kernel()) {
 110         return false;
 111     }
 112     return true;
 113 }
 114
 115 static bool all_cpu_threads_idle(void)
 116 {
 117     CPUState *cpu;
 118
 119     CPU_FOREACH(cpu) {
 120         if (!cpu_thread_is_idle(cpu)) {
 121             return false;
 122         }
 123     }
 124     return true;
 125 }
 126
 127 /***********************************************************/
 128 /* guest cycle counter */
 129
 130 /* Protected by TimersState seqlock */
 131
/*
 * Value of the icount "sleep" option as stored by configure_icount().
 * qemu_start_warp_timer() and qemu_account_warp_timer() consult it to
 * choose between warping via the warp timer and advancing the clock
 * immediately.
 */
static bool icount_sleep = true;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
/* Upper bound enforced on icount_time_shift by icount_adjust() and
 * configure_icount(). */
#define MAX_ICOUNT_SHIFT 10
 135
/*
 * State backing the guest-visible clocks handled in this file: the host
 * tick counter offsets, the monotonic VM clock offset and, when icount is
 * in use, the instruction counter together with its bias and timers.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    /* Last value returned by cpu_get_ticks_locked(); keeps ticks monotonic. */
    int64_t cpu_ticks_prev;
    /* Added to the host tick counter to obtain the guest tick value. */
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    /* Non-zero between cpu_enable_ticks() and cpu_disable_ticks(). */
    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks.  */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;

    /* Start of a pending clock warp, or -1 when none is pending. */
    int64_t vm_clock_warp_start;
    /* Added to get_clock() to obtain the VM clock while ticks are enabled. */
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    QEMUTimer *icount_warp_timer;
} TimersState;

/* The single instance used by every function in this file. */
static TimersState timers_state;
/* NOTE(review): not referenced in this part of the file; presumably set
 * when multi-threaded TCG is selected -- confirm. */
bool mttcg_enabled;
 169
 170
 171 /* The current number of executed instructions is based on what we
 172  * originally budgeted minus the current state of the decrementing
 173  * icount counters in extra/u16.low.
 174  */
 175 static int64_t cpu_get_icount_executed(CPUState *cpu)
 176 {
 177     return (cpu->icount_budget -
 178             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
 179 }
 180
 181 /*
 182  * Update the global shared timer_state.qemu_icount to take into
 183  * account executed instructions. This is done by the TCG vCPU
 184  * thread so the main-loop can see time has moved forward.
 185  */
 186 static void cpu_update_icount_locked(CPUState *cpu)
 187 {
 188     int64_t executed = cpu_get_icount_executed(cpu);
 189     cpu->icount_budget -= executed;
 190
 191     atomic_set_i64(&timers_state.qemu_icount,
 192                    timers_state.qemu_icount + executed);
 193 }
 194
 195 /*
 196  * Update the global shared timer_state.qemu_icount to take into
 197  * account executed instructions. This is done by the TCG vCPU
 198  * thread so the main-loop can see time has moved forward.
 199  */
 200 void cpu_update_icount(CPUState *cpu)
 201 {
 202     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 203                        &timers_state.vm_clock_lock);
 204     cpu_update_icount_locked(cpu);
 205     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 206                          &timers_state.vm_clock_lock);
 207 }
 208
 209 static int64_t cpu_get_icount_raw_locked(void)
 210 {
 211     CPUState *cpu = current_cpu;
 212
 213     if (cpu && cpu->running) {
 214         if (!cpu->can_do_io) {
 215             error_report("Bad icount read");
 216             exit(1);
 217         }
 218         /* Take into account what has run */
 219         cpu_update_icount_locked(cpu);
 220     }
 221     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 222     return atomic_read_i64(&timers_state.qemu_icount);
 223 }
 224
 225 static int64_t cpu_get_icount_locked(void)
 226 {
 227     int64_t icount = cpu_get_icount_raw_locked();
 228     return atomic_read_i64(&timers_state.qemu_icount_bias) +
 229         cpu_icount_to_ns(icount);
 230 }
 231
 232 int64_t cpu_get_icount_raw(void)
 233 {
 234     int64_t icount;
 235     unsigned start;
 236
 237     do {
 238         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 239         icount = cpu_get_icount_raw_locked();
 240     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 241
 242     return icount;
 243 }
 244
 245 /* Return the virtual CPU time, based on the instruction counter.  */
 246 int64_t cpu_get_icount(void)
 247 {
 248     int64_t icount;
 249     unsigned start;
 250
 251     do {
 252         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 253         icount = cpu_get_icount_locked();
 254     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 255
 256     return icount;
 257 }
 258
 259 int64_t cpu_icount_to_ns(int64_t icount)
 260 {
 261     return icount << atomic_read(&timers_state.icount_time_shift);
 262 }
 263
 264 static int64_t cpu_get_ticks_locked(void)
 265 {
 266     int64_t ticks = timers_state.cpu_ticks_offset;
 267     if (timers_state.cpu_ticks_enabled) {
 268         ticks += cpu_get_host_ticks();
 269     }
 270
 271     if (timers_state.cpu_ticks_prev > ticks) {
 272         /* Non increasing ticks may happen if the host uses software suspend.  */
 273         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 274         ticks = timers_state.cpu_ticks_prev;
 275     }
 276
 277     timers_state.cpu_ticks_prev = ticks;
 278     return ticks;
 279 }
 280
 281 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 282  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 283  * counter.
 284  */
 285 int64_t cpu_get_ticks(void)
 286 {
 287     int64_t ticks;
 288
 289     if (use_icount) {
 290         return cpu_get_icount();
 291     }
 292
 293     qemu_spin_lock(&timers_state.vm_clock_lock);
 294     ticks = cpu_get_ticks_locked();
 295     qemu_spin_unlock(&timers_state.vm_clock_lock);
 296     return ticks;
 297 }
 298
 299 static int64_t cpu_get_clock_locked(void)
 300 {
 301     int64_t time;
 302
 303     time = timers_state.cpu_clock_offset;
 304     if (timers_state.cpu_ticks_enabled) {
 305         time += get_clock();
 306     }
 307
 308     return time;
 309 }
 310
 311 /* Return the monotonic time elapsed in VM, i.e.,
 312  * the time between vm_start and vm_stop
 313  */
 314 int64_t cpu_get_clock(void)
 315 {
 316     int64_t ti;
 317     unsigned start;
 318
 319     do {
 320         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 321         ti = cpu_get_clock_locked();
 322     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 323
 324     return ti;
 325 }
 326
 327 /* enable cpu_get_ticks()
 328  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 329  */
 330 void cpu_enable_ticks(void)
 331 {
 332     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 333                        &timers_state.vm_clock_lock);
 334     if (!timers_state.cpu_ticks_enabled) {
 335         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 336         timers_state.cpu_clock_offset -= get_clock();
 337         timers_state.cpu_ticks_enabled = 1;
 338     }
 339     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 340                        &timers_state.vm_clock_lock);
 341 }
 342
 343 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 344  * cpu_get_ticks() after that.
 345  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 346  */
 347 void cpu_disable_ticks(void)
 348 {
 349     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 350                        &timers_state.vm_clock_lock);
 351     if (timers_state.cpu_ticks_enabled) {
 352         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 353         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 354         timers_state.cpu_ticks_enabled = 0;
 355     }
 356     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 357                          &timers_state.vm_clock_lock);
 358 }
 359
 360 /* Correlation between real and virtual time is always going to be
 361    fairly approximate, so ignore small variation.
 362    When the guest is idle real and virtual time will be aligned in
 363    the IO wait loop.  */
 364 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 365
 366 static void icount_adjust(void)
 367 {
 368     int64_t cur_time;
 369     int64_t cur_icount;
 370     int64_t delta;
 371
 372     /* Protected by TimersState mutex.  */
 373     static int64_t last_delta;
 374
 375     /* If the VM is not running, then do nothing.  */
 376     if (!runstate_is_running()) {
 377         return;
 378     }
 379
 380     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 381                        &timers_state.vm_clock_lock);
 382     cur_time = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 383                                    cpu_get_clock_locked());
 384     cur_icount = cpu_get_icount_locked();
 385
 386     delta = cur_icount - cur_time;
 387     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 388     if (delta > 0
 389         && last_delta + ICOUNT_WOBBLE < delta * 2
 390         && timers_state.icount_time_shift > 0) {
 391         /* The guest is getting too far ahead.  Slow time down.  */
 392         atomic_set(&timers_state.icount_time_shift,
 393                    timers_state.icount_time_shift - 1);
 394     }
 395     if (delta < 0
 396         && last_delta - ICOUNT_WOBBLE > delta * 2
 397         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 398         /* The guest is getting too far behind.  Speed time up.  */
 399         atomic_set(&timers_state.icount_time_shift,
 400                    timers_state.icount_time_shift + 1);
 401     }
 402     last_delta = delta;
 403     atomic_set_i64(&timers_state.qemu_icount_bias,
 404                    cur_icount - (timers_state.qemu_icount
 405                                  << timers_state.icount_time_shift));
 406     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 407                          &timers_state.vm_clock_lock);
 408 }
 409
 410 static void icount_adjust_rt(void *opaque)
 411 {
 412     timer_mod(timers_state.icount_rt_timer,
 413               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 414     icount_adjust();
 415 }
 416
 417 static void icount_adjust_vm(void *opaque)
 418 {
 419     timer_mod(timers_state.icount_vm_timer,
 420                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 421                    NANOSECONDS_PER_SECOND / 10);
 422     icount_adjust();
 423 }
 424
 425 static int64_t qemu_icount_round(int64_t count)
 426 {
 427     int shift = atomic_read(&timers_state.icount_time_shift);
 428     return (count + (1 << shift) - 1) >> shift;
 429 }
 430
 431 static void icount_warp_rt(void)
 432 {
 433     unsigned seq;
 434     int64_t warp_start;
 435
 436     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 437      * changes from -1 to another value, so the race here is okay.
 438      */
 439     do {
 440         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 441         warp_start = timers_state.vm_clock_warp_start;
 442     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 443
 444     if (warp_start == -1) {
 445         return;
 446     }
 447
 448     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 449                        &timers_state.vm_clock_lock);
 450     if (runstate_is_running()) {
 451         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 452                                             cpu_get_clock_locked());
 453         int64_t warp_delta;
 454
 455         warp_delta = clock - timers_state.vm_clock_warp_start;
 456         if (use_icount == 2) {
 457             /*
 458              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 459              * far ahead of real time.
 460              */
 461             int64_t cur_icount = cpu_get_icount_locked();
 462             int64_t delta = clock - cur_icount;
 463             warp_delta = MIN(warp_delta, delta);
 464         }
 465         atomic_set_i64(&timers_state.qemu_icount_bias,
 466                        timers_state.qemu_icount_bias + warp_delta);
 467     }
 468     timers_state.vm_clock_warp_start = -1;
 469     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 470                        &timers_state.vm_clock_lock);
 471
 472     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 473         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 474     }
 475 }
 476
 477 static void icount_timer_cb(void *opaque)
 478 {
 479     /* No need for a checkpoint because the timer already synchronizes
 480      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 481      */
 482     icount_warp_rt();
 483 }
 484
 485 void qtest_clock_warp(int64_t dest)
 486 {
 487     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 488     AioContext *aio_context;
 489     assert(qtest_enabled());
 490     aio_context = qemu_get_aio_context();
 491     while (clock < dest) {
 492         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 493                                                       QEMU_TIMER_ATTR_ALL);
 494         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 495
 496         seqlock_write_lock(&timers_state.vm_clock_seqlock,
 497                            &timers_state.vm_clock_lock);
 498         atomic_set_i64(&timers_state.qemu_icount_bias,
 499                        timers_state.qemu_icount_bias + warp);
 500         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 501                              &timers_state.vm_clock_lock);
 502
 503         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 504         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 505         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 506     }
 507     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 508 }
 509
 510 void qemu_start_warp_timer(void)
 511 {
 512     int64_t clock;
 513     int64_t deadline;
 514
 515     if (!use_icount) {
 516         return;
 517     }
 518
 519     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 520      * do not fire, so computing the deadline does not make sense.
 521      */
 522     if (!runstate_is_running()) {
 523         return;
 524     }
 525
 526     if (replay_mode != REPLAY_MODE_PLAY) {
 527         if (!all_cpu_threads_idle()) {
 528             return;
 529         }
 530
 531         if (qtest_enabled()) {
 532             /* When testing, qtest commands advance icount.  */
 533             return;
 534         }
 535
 536         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
 537     } else {
 538         /* warp clock deterministically in record/replay mode */
 539         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 540             /* vCPU is sleeping and warp can't be started.
 541                It is probably a race condition: notification sent
 542                to vCPU was processed in advance and vCPU went to sleep.
 543                Therefore we have to wake it up for doing someting. */
 544             if (replay_has_checkpoint()) {
 545                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 546             }
 547             return;
 548         }
 549     }
 550
 551     /* We want to use the earliest deadline from ALL vm_clocks */
 552     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 553     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 554                                           ~QEMU_TIMER_ATTR_EXTERNAL);
 555     if (deadline < 0) {
 556         static bool notified;
 557         if (!icount_sleep && !notified) {
 558             warn_report("icount sleep disabled and no active timers");
 559             notified = true;
 560         }
 561         return;
 562     }
 563
 564     if (deadline > 0) {
 565         /*
 566          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 567          * sleep.  Otherwise, the CPU might be waiting for a future timer
 568          * interrupt to wake it up, but the interrupt never comes because
 569          * the vCPU isn't running any insns and thus doesn't advance the
 570          * QEMU_CLOCK_VIRTUAL.
 571          */
 572         if (!icount_sleep) {
 573             /*
 574              * We never let VCPUs sleep in no sleep icount mode.
 575              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 576              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 577              * It is useful when we want a deterministic execution time,
 578              * isolated from host latencies.
 579              */
 580             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 581                                &timers_state.vm_clock_lock);
 582             atomic_set_i64(&timers_state.qemu_icount_bias,
 583                            timers_state.qemu_icount_bias + deadline);
 584             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 585                                  &timers_state.vm_clock_lock);
 586             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 587         } else {
 588             /*
 589              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 590              * "real" time, (related to the time left until the next event) has
 591              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 592              * This avoids that the warps are visible externally; for example,
 593              * you will not be sending network packets continuously instead of
 594              * every 100ms.
 595              */
 596             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 597                                &timers_state.vm_clock_lock);
 598             if (timers_state.vm_clock_warp_start == -1
 599                 || timers_state.vm_clock_warp_start > clock) {
 600                 timers_state.vm_clock_warp_start = clock;
 601             }
 602             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 603                                  &timers_state.vm_clock_lock);
 604             timer_mod_anticipate(timers_state.icount_warp_timer,
 605                                  clock + deadline);
 606         }
 607     } else if (deadline == 0) {
 608         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 609     }
 610 }
 611
 612 static void qemu_account_warp_timer(void)
 613 {
 614     if (!use_icount || !icount_sleep) {
 615         return;
 616     }
 617
 618     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 619      * do not fire, so computing the deadline does not make sense.
 620      */
 621     if (!runstate_is_running()) {
 622         return;
 623     }
 624
 625     /* warp clock deterministically in record/replay mode */
 626     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 627         return;
 628     }
 629
 630     timer_del(timers_state.icount_warp_timer);
 631     icount_warp_rt();
 632 }
 633
 634 static bool icount_state_needed(void *opaque)
 635 {
 636     return use_icount;
 637 }
 638
 639 static bool warp_timer_state_needed(void *opaque)
 640 {
 641     TimersState *s = opaque;
 642     return s->icount_warp_timer != NULL;
 643 }
 644
 645 static bool adjust_timers_state_needed(void *opaque)
 646 {
 647     TimersState *s = opaque;
 648     return s->icount_rt_timer != NULL;
 649 }
 650
 651 static bool shift_state_needed(void *opaque)
 652 {
 653     return use_icount == 2;
 654 }
 655
/*
 * Subsection for warp timer migration.  It is optional because the warp
 * timer may not have been created; warp_timer_state_needed() decides
 * whether it is used.  Carries vm_clock_warp_start and the timer itself.
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 670
/*
 * Subsection carrying the two icount adjustment timers.  It is used only
 * when adjust_timers_state_needed() finds icount_rt_timer allocated.
 */
static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 682
/*
 * Subsection carrying icount_time_shift.  It is used only when
 * shift_state_needed() reports use_icount == 2.
 */
static const VMStateDescription icount_vmstate_shift = {
    .name = "timer/icount/shift",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = shift_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT16(icount_time_shift, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 693
/*
 * This is a subsection for icount migration, used when
 * icount_state_needed() reports that icount is on.  It carries the icount
 * bias and counter, and nests the warp timer, adjustment timers and shift
 * subsections.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        &icount_vmstate_shift,
        NULL
    }
};
 714
/*
 * Top-level migration description for TimersState, registered by
 * cpu_ticks_init().  The icount state travels in a subsection.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        /* NOTE(review): presumably an 8-byte placeholder for a field that
         * is no longer stored -- confirm against the stream history. */
        VMSTATE_UNUSED(8),
        /* NOTE(review): the trailing 2 presumably restricts this field to
         * stream version 2 and later -- confirm. */
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
 730
 731 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 732 {
 733     double pct;
 734     double throttle_ratio;
 735     int64_t sleeptime_ns, endtime_ns;
 736
 737     if (!cpu_throttle_get_percentage()) {
 738         return;
 739     }
 740
 741     pct = (double)cpu_throttle_get_percentage()/100;
 742     throttle_ratio = pct / (1 - pct);
 743     /* Add 1ns to fix double's rounding error (like 0.9999999...) */
 744     sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
 745     endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
 746     while (sleeptime_ns > 0 && !cpu->stop) {
 747         if (sleeptime_ns > SCALE_MS) {
 748             qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
 749                                 sleeptime_ns / SCALE_MS);
 750         } else {
 751             qemu_mutex_unlock_iothread();
 752             g_usleep(sleeptime_ns / SCALE_US);
 753             qemu_mutex_lock_iothread();
 754         }
 755         sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 756     }
 757     atomic_set(&cpu->throttle_thread_scheduled, 0);
 758 }
 759
 760 static void cpu_throttle_timer_tick(void *opaque)
 761 {
 762     CPUState *cpu;
 763     double pct;
 764
 765     /* Stop the timer if needed */
 766     if (!cpu_throttle_get_percentage()) {
 767         return;
 768     }
 769     CPU_FOREACH(cpu) {
 770         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 771             async_run_on_cpu(cpu, cpu_throttle_thread,
 772                              RUN_ON_CPU_NULL);
 773         }
 774     }
 775
 776     pct = (double)cpu_throttle_get_percentage()/100;
 777     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 778                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 779 }
 780
 781 void cpu_throttle_set(int new_throttle_pct)
 782 {
 783     /* Ensure throttle percentage is within valid range */
 784     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 785     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 786
 787     atomic_set(&throttle_percentage, new_throttle_pct);
 788
 789     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 790                                        CPU_THROTTLE_TIMESLICE_NS);
 791 }
 792
 793 void cpu_throttle_stop(void)
 794 {
 795     atomic_set(&throttle_percentage, 0);
 796 }
 797
 798 bool cpu_throttle_active(void)
 799 {
 800     return (cpu_throttle_get_percentage() != 0);
 801 }
 802
 803 int cpu_throttle_get_percentage(void)
 804 {
 805     return atomic_read(&throttle_percentage);
 806 }
 807
 808 void cpu_ticks_init(void)
 809 {
 810     seqlock_init(&timers_state.vm_clock_seqlock);
 811     qemu_spin_init(&timers_state.vm_clock_lock);
 812     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 813     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 814                                            cpu_throttle_timer_tick, NULL);
 815 }
 816
 817 void configure_icount(QemuOpts *opts, Error **errp)
 818 {
 819     const char *option = qemu_opt_get(opts, "shift");
 820     bool sleep = qemu_opt_get_bool(opts, "sleep", true);
 821     bool align = qemu_opt_get_bool(opts, "align", false);
 822     long time_shift = -1;
 823
 824     if (!option) {
 825         if (qemu_opt_get(opts, "align") != NULL) {
 826             error_setg(errp, "Please specify shift option when using align");
 827         }
 828         return;
 829     }
 830
 831     if (align && !sleep) {
 832         error_setg(errp, "align=on and sleep=off are incompatible");
 833         return;
 834     }
 835
 836     if (strcmp(option, "auto") != 0) {
 837         if (qemu_strtol(option, NULL, 0, &time_shift) < 0
 838             || time_shift < 0 || time_shift > MAX_ICOUNT_SHIFT) {
 839             error_setg(errp, "icount: Invalid shift value");
 840             return;
 841         }
 842     } else if (icount_align_option) {
 843         error_setg(errp, "shift=auto and align=on are incompatible");
 844         return;
 845     } else if (!icount_sleep) {
 846         error_setg(errp, "shift=auto and sleep=off are incompatible");
 847         return;
 848     }
 849
 850     icount_sleep = sleep;
 851     if (icount_sleep) {
 852         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 853                                          icount_timer_cb, NULL);
 854     }
 855
 856     icount_align_option = align;
 857
 858     if (time_shift >= 0) {
 859         timers_state.icount_time_shift = time_shift;
 860         use_icount = 1;
 861         return;
 862     }
 863
 864     use_icount = 2;
 865
 866     /* 125MIPS seems a reasonable initial guess at the guest speed.
 867        It will be corrected fairly quickly anyway.  */
 868     timers_state.icount_time_shift = 3;
 869
 870     /* Have both realtime and virtual time triggers for speed adjustment.
 871        The realtime trigger catches emulated time passing too slowly,
 872        the virtual time trigger catches emulated time passing too fast.
 873        Realtime triggers occur even when idle, so use them less frequently
 874        than VM triggers.  */
 875     timers_state.vm_clock_warp_start = -1;
 876     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 877                                    icount_adjust_rt, NULL);
 878     timer_mod(timers_state.icount_rt_timer,
 879                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 880     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 881                                         icount_adjust_vm, NULL);
 882     timer_mod(timers_state.icount_vm_timer,
 883                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 884                    NANOSECONDS_PER_SECOND / 10);
 885 }
 886
/***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */

/* Timer re-armed by kick_tcg_thread() every TCG_KICK_PERIOD. */
static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU that qemu_cpu_kick_rr_next_cpu() kicks; read with atomic_mb_read(). */
static CPUState *tcg_current_rr_cpu;

/* Interval between kicks: a tenth of a second, in nanoseconds. */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 903
 904 static inline int64_t qemu_tcg_next_kick(void)
 905 {
 906     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 907 }
 908
 909 /* Kick the currently round-robin scheduled vCPU to next */
 910 static void qemu_cpu_kick_rr_next_cpu(void)
 911 {
 912     CPUState *cpu;
 913     do {
 914         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 915         if (cpu) {
 916             cpu_exit(cpu);
 917         }
 918     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 919 }
 920
 921 /* Kick all RR vCPUs */
 922 static void qemu_cpu_kick_rr_cpus(void)
 923 {
 924     CPUState *cpu;
 925
 926     CPU_FOREACH(cpu) {
 927         cpu_exit(cpu);
 928     };
 929 }
 930
/* Empty work item.  qemu_timer_notify_cb() queues it with
 * async_run_on_cpu() purely for the side effect of having work pending. */
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}
 934
 935 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 936 {
 937     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 938         qemu_notify_event();
 939         return;
 940     }
 941
 942     if (qemu_in_vcpu_thread()) {
 943         /* A CPU is currently running; kick it back out to the
 944          * tcg_cpu_exec() loop so it will recalculate its
 945          * icount deadline immediately.
 946          */
 947         qemu_cpu_kick(current_cpu);
 948     } else if (first_cpu) {
 949         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 950          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 951          * causes cpu_thread_is_idle to return false.  This way,
 952          * handle_icount_deadline can run.
 953          * If we have no CPUs at all for some reason, we don't
 954          * need to do anything.
 955          */
 956         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 957     }
 958 }
 959
/* Callback of tcg_kick_vcpu_timer (installed by start_tcg_kick_timer()):
 * re-arm the timer one period ahead, then kick the vCPU currently being
 * run by the round-robin thread.  @opaque is unused. */
static void kick_tcg_thread(void *opaque)
{
    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    qemu_cpu_kick_rr_next_cpu();
}
 965
 966 static void start_tcg_kick_timer(void)
 967 {
 968     assert(!mttcg_enabled);
 969     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 970         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 971                                            kick_tcg_thread, NULL);
 972     }
 973     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
 974         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 975     }
 976 }
 977
 978 static void stop_tcg_kick_timer(void)
 979 {
 980     assert(!mttcg_enabled);
 981     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
 982         timer_del(tcg_kick_vcpu_timer);
 983     }
 984 }
 985
 986 /***********************************************************/
 987 void hw_error(const char *fmt, ...)
 988 {
 989     va_list ap;
 990     CPUState *cpu;
 991
 992     va_start(ap, fmt);
 993     fprintf(stderr, "qemu: hardware error: ");
 994     vfprintf(stderr, fmt, ap);
 995     fprintf(stderr, "\n");
 996     CPU_FOREACH(cpu) {
 997         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
 998         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
 999     }
1000     va_end(ap);
1001     abort();
1002 }
1003
1004 void cpu_synchronize_all_states(void)
1005 {
1006     CPUState *cpu;
1007
1008     CPU_FOREACH(cpu) {
1009         cpu_synchronize_state(cpu);
1010         /* TODO: move to cpu_synchronize_state() */
1011         if (hvf_enabled()) {
1012             hvf_cpu_synchronize_state(cpu);
1013         }
1014     }
1015 }
1016
1017 void cpu_synchronize_all_post_reset(void)
1018 {
1019     CPUState *cpu;
1020
1021     CPU_FOREACH(cpu) {
1022         cpu_synchronize_post_reset(cpu);
1023         /* TODO: move to cpu_synchronize_post_reset() */
1024         if (hvf_enabled()) {
1025             hvf_cpu_synchronize_post_reset(cpu);
1026         }
1027     }
1028 }
1029
1030 void cpu_synchronize_all_post_init(void)
1031 {
1032     CPUState *cpu;
1033
1034     CPU_FOREACH(cpu) {
1035         cpu_synchronize_post_init(cpu);
1036         /* TODO: move to cpu_synchronize_post_init() */
1037         if (hvf_enabled()) {
1038             hvf_cpu_synchronize_post_init(cpu);
1039         }
1040     }
1041 }
1042
1043 void cpu_synchronize_all_pre_loadvm(void)
1044 {
1045     CPUState *cpu;
1046
1047     CPU_FOREACH(cpu) {
1048         cpu_synchronize_pre_loadvm(cpu);
1049     }
1050 }
1051
1052 static int do_vm_stop(RunState state, bool send_stop)
1053 {
1054     int ret = 0;
1055
1056     if (runstate_is_running()) {
1057         runstate_set(state);
1058         cpu_disable_ticks();
1059         pause_all_vcpus();
1060         vm_state_notify(0, state);
1061         if (send_stop) {
1062             qapi_event_send_stop();
1063         }
1064     }
1065
1066     bdrv_drain_all();
1067     ret = bdrv_flush_all();
1068
1069     return ret;
1070 }
1071
/* Special vm_stop() variant for terminating the process.  Historically clients
 * did not expect a QMP STOP event and so we need to retain compatibility.
 *
 * Stops the VM into RUN_STATE_SHUTDOWN without sending the STOP event and
 * returns do_vm_stop()'s result.
 */
int vm_shutdown(void)
{
    return do_vm_stop(RUN_STATE_SHUTDOWN, false);
}
1079
1080 static bool cpu_can_run(CPUState *cpu)
1081 {
1082     if (cpu->stop) {
1083         return false;
1084     }
1085     if (cpu_is_stopped(cpu)) {
1086         return false;
1087     }
1088     return true;
1089 }
1090
/* Handle an EXCP_DEBUG exit from a vCPU execution loop: tell the gdbstub
 * which CPU stopped, raise a system debug request and mark the CPU as
 * stopped. */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1097
1098 #ifdef CONFIG_LINUX
1099 static void sigbus_reraise(void)
1100 {
1101     sigset_t set;
1102     struct sigaction action;
1103
1104     memset(&action, 0, sizeof(action));
1105     action.sa_handler = SIG_DFL;
1106     if (!sigaction(SIGBUS, &action, NULL)) {
1107         raise(SIGBUS);
1108         sigemptyset(&set);
1109         sigaddset(&set, SIGBUS);
1110         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1111     }
1112     perror("Failed to re-raise SIGBUS!\n");
1113     abort();
1114 }
1115
1116 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1117 {
1118     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1119         sigbus_reraise();
1120     }
1121
1122     if (current_cpu) {
1123         /* Called asynchronously in VCPU thread.  */
1124         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1125             sigbus_reraise();
1126         }
1127     } else {
1128         /* Called synchronously (via signalfd) in main thread.  */
1129         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1130             sigbus_reraise();
1131         }
1132     }
1133 }
1134
1135 static void qemu_init_sigbus(void)
1136 {
1137     struct sigaction action;
1138
1139     memset(&action, 0, sizeof(action));
1140     action.sa_flags = SA_SIGINFO;
1141     action.sa_sigaction = sigbus_handler;
1142     sigaction(SIGBUS, &action, NULL);
1143
1144     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1145 }
1146 #else /* !CONFIG_LINUX */
/* No SIGBUS setup is performed when CONFIG_LINUX is not defined. */
static void qemu_init_sigbus(void)
{
}
1150 #endif /* !CONFIG_LINUX */
1151
/* Thread that called qemu_init_cpu_loop(). */
static QemuThread io_thread;

/* cpu creation */
/* Signalled by vCPU threads whenever they change cpu->created. */
static QemuCond qemu_cpu_cond;
/* system init */
/* Broadcast by qemu_cpu_stop(); pause_all_vcpus() waits on it. */
static QemuCond qemu_pause_cond;
1158
/* One-time initialisation of the vCPU loop infrastructure: SIGBUS
 * handling, the creation/pause condition variables and the global mutex.
 * The calling thread is recorded in io_thread. */
void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}
1168
/* Run @func with @data on @cpu by delegating to do_run_on_cpu(), passing
 * the global mutex as the lock argument. */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1173
1174 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1175 {
1176     if (kvm_destroy_vcpu(cpu) < 0) {
1177         error_report("kvm_destroy_vcpu failed");
1178         exit(EXIT_FAILURE);
1179     }
1180 }
1181
/* TCG counterpart of qemu_kvm_destroy_vcpu(); currently does nothing. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1185
/* Mark @cpu as stopped.  Must be called from @cpu's own thread.
 *
 * Clears the pending stop request, sets cpu->stopped, optionally calls
 * cpu_exit() when @exit is true, and wakes everyone waiting on
 * qemu_pause_cond (e.g. pause_all_vcpus()).
 */
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}
1196
/* Housekeeping shared by all wait-for-event paths: clear the
 * "thread kicked" flag, honour a pending stop request and run any work
 * items queued for @cpu. */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
1205
/* Round-robin TCG variant of qemu_wait_io_event().
 *
 * While every vCPU thread is idle, the kick timer is stopped and the
 * thread sleeps on first_cpu's halt condition.  Once
 * there is something to do the kick timer is restarted and the common
 * housekeeping is performed for every vCPU.
 */
static void qemu_tcg_rr_wait_io_event(void)
{
    CPUState *cpu;

    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}
1221
1222 static void qemu_wait_io_event(CPUState *cpu)
1223 {
1224     bool slept = false;
1225
1226     while (cpu_thread_is_idle(cpu)) {
1227         if (!slept) {
1228             slept = true;
1229             qemu_plugin_vcpu_idle_cb(cpu);
1230         }
1231         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1232     }
1233     if (slept) {
1234         qemu_plugin_vcpu_resume_cb(cpu);
1235     }
1236
1237 #ifdef _WIN32
1238     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1239     if (!tcg_enabled()) {
1240         SleepEx(0, TRUE);
1241     }
1242 #endif
1243     qemu_wait_io_event_common(cpu);
1244 }
1245
/* Thread body for a KVM vCPU.  @arg is the CPUState to run.
 *
 * Takes the iothread lock, initialises the KVM vCPU (fatal on failure) and
 * its signals, announces creation on qemu_cpu_cond, then alternates
 * between executing guest code and waiting for events until the CPU is
 * unplugged and can no longer run.  On exit the vCPU is destroyed and
 * cpu->created is cleared.  Always returns NULL.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU destruction to whoever waits on qemu_cpu_cond */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1289
1290 static void *qemu_dummy_cpu_thread_fn(void *arg)
1291 {
1292 #ifdef _WIN32
1293     error_report("qtest is not supported under Windows");
1294     exit(1);
1295 #else
1296     CPUState *cpu = arg;
1297     sigset_t waitset;
1298     int r;
1299
1300     rcu_register_thread();
1301
1302     qemu_mutex_lock_iothread();
1303     qemu_thread_get_self(cpu->thread);
1304     cpu->thread_id = qemu_get_thread_id();
1305     cpu->can_do_io = 1;
1306     current_cpu = cpu;
1307
1308     sigemptyset(&waitset);
1309     sigaddset(&waitset, SIG_IPI);
1310
1311     /* signal CPU creation */
1312     cpu->created = true;
1313     qemu_cond_signal(&qemu_cpu_cond);
1314     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1315
1316     do {
1317         qemu_mutex_unlock_iothread();
1318         do {
1319             int sig;
1320             r = sigwait(&waitset, &sig);
1321         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1322         if (r == -1) {
1323             perror("sigwait");
1324             exit(1);
1325         }
1326         qemu_mutex_lock_iothread();
1327         qemu_wait_io_event(cpu);
1328     } while (!cpu->unplug);
1329
1330     qemu_mutex_unlock_iothread();
1331     rcu_unregister_thread();
1332     return NULL;
1333 #endif
1334 }
1335
1336 static int64_t tcg_get_icount_limit(void)
1337 {
1338     int64_t deadline;
1339
1340     if (replay_mode != REPLAY_MODE_PLAY) {
1341         /*
1342          * Include all the timers, because they may need an attention.
1343          * Too long CPU execution may create unnecessary delay in UI.
1344          */
1345         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1346                                               QEMU_TIMER_ATTR_ALL);
1347         /* Check realtime timers, because they help with input processing */
1348         deadline = qemu_soonest_timeout(deadline,
1349                 qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
1350                                            QEMU_TIMER_ATTR_ALL));
1351
1352         /* Maintain prior (possibly buggy) behaviour where if no deadline
1353          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1354          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1355          * nanoseconds.
1356          */
1357         if ((deadline < 0) || (deadline > INT32_MAX)) {
1358             deadline = INT32_MAX;
1359         }
1360
1361         return qemu_icount_round(deadline);
1362     } else {
1363         return replay_get_instructions();
1364     }
1365 }
1366
1367 static void handle_icount_deadline(void)
1368 {
1369     assert(qemu_in_vcpu_thread());
1370     if (use_icount) {
1371         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1372                                                       QEMU_TIMER_ATTR_ALL);
1373
1374         if (deadline == 0) {
1375             /* Wake up other AioContexts.  */
1376             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1377             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1378         }
1379     }
1380 }
1381
1382 static void prepare_icount_for_run(CPUState *cpu)
1383 {
1384     if (use_icount) {
1385         int insns_left;
1386
1387         /* These should always be cleared by process_icount_data after
1388          * each vCPU execution. However u16.high can be raised
1389          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1390          */
1391         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1392         g_assert(cpu->icount_extra == 0);
1393
1394         cpu->icount_budget = tcg_get_icount_limit();
1395         insns_left = MIN(0xffff, cpu->icount_budget);
1396         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1397         cpu->icount_extra = cpu->icount_budget - insns_left;
1398
1399         replay_mutex_lock();
1400     }
1401 }
1402
1403 static void process_icount_data(CPUState *cpu)
1404 {
1405     if (use_icount) {
1406         /* Account for executed instructions */
1407         cpu_update_icount(cpu);
1408
1409         /* Reset the counters */
1410         cpu_neg(cpu)->icount_decr.u16.low = 0;
1411         cpu->icount_extra = 0;
1412         cpu->icount_budget = 0;
1413
1414         replay_account_executed_instructions();
1415
1416         replay_mutex_unlock();
1417     }
1418 }
1419
1420
/* Execute guest code on @cpu with TCG and return cpu_exec()'s result.
 * The call is bracketed by cpu_exec_start()/cpu_exec_end().  With
 * CONFIG_PROFILER the elapsed time is added to prof.cpu_exec_time. */
static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

    assert(tcg_enabled());
#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    atomic_set(&tcg_ctx->prof.cpu_exec_time,
               tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
#endif
    return ret;
}
1441
1442 /* Destroy any remaining vCPUs which have been unplugged and have
1443  * finished running
1444  */
1445 static void deal_with_unplugged_cpus(void)
1446 {
1447     CPUState *cpu;
1448
1449     CPU_FOREACH(cpu) {
1450         if (cpu->unplug && !cpu_can_run(cpu)) {
1451             qemu_tcg_destroy_vcpu(cpu);
1452             cpu->created = false;
1453             qemu_cond_signal(&qemu_cpu_cond);
1454             break;
1455         }
1456     }
1457 }
1458
1459 /* Single-threaded TCG
1460  *
1461  * In the single-threaded case each vCPU is simulated in turn. If
1462  * there is more than a single vCPU we create a simple timer to kick
1463  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1464  * This is done explicitly rather than relying on side-effects
1465  * elsewhere.
1466  */
1467
/* Thread body of the single thread that runs all vCPUs in round-robin
 * TCG mode.  @arg is the CPUState the thread was created for.
 *
 * After announcing creation it waits until first_cpu is no longer stopped,
 * then loops forever: handle timers/icount, run each runnable vCPU in turn
 * until one has queued work or an exit request, then wait for events and
 * reap unplugged vCPUs.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /* Take the replay mutex before re-taking the iothread lock. */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* Wrap around to the start of the CPU list. */
        if (!cpu) {
            cpu = first_cpu;
        }

        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code runs without the iothread lock held. */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* Skip past a CPU that is being unplugged. */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    /* Not reached: the loop above has no exit. */
    rcu_unregister_thread();
    return NULL;
}
1581
/* Thread body for a HAX vCPU.  @arg is the CPUState to run.
 *
 * Initialises the HAX vCPU, announces creation on qemu_cpu_cond, then
 * alternates between executing guest code and waiting for events until
 * the CPU is unplugged and can no longer run.  Returns NULL.
 *
 * NOTE(review): unlike the KVM/HVF/WHPX thread functions in this file,
 * the exit path here neither releases the iothread lock nor clears
 * cpu->created/signals qemu_cpu_cond - confirm whether this is intended.
 */
static void *qemu_hax_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();
    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    current_cpu = cpu;

    hax_init_vcpu(cpu);
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = hax_smp_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }

        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));
    rcu_unregister_thread();
    return NULL;
}
1612
/* The HVF-specific vCPU thread function. This one should only run when the host
 * CPU supports the VMX "unrestricted guest" feature.
 *
 * @arg is the CPUState to run.  The thread initialises the HVF vCPU,
 * announces creation on qemu_cpu_cond, then alternates between executing
 * guest code and waiting for events until the CPU is unplugged and can no
 * longer run; it then destroys the vCPU and clears cpu->created.
 * Returns NULL.
 */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU destruction */
    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1656
/* Thread body for a WHPX vCPU.  @arg is the CPUState to run.
 *
 * Initialises the WHPX vCPU (fatal on failure), announces creation on
 * qemu_cpu_cond, then alternates between executing guest code and waiting
 * for events until the CPU is unplugged and can no longer run; it then
 * destroys the vCPU and clears cpu->created.  Returns NULL.
 *
 * The wait is open-coded here (halt_cond wait followed by
 * qemu_wait_io_event_common()) rather than going through
 * qemu_wait_io_event().
 */
static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU destruction */
    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1700
1701 #ifdef _WIN32
/* Empty APC routine; qemu_cpu_kick_thread() queues it with QueueUserAPC()
 * on the target vCPU thread as the Windows kick mechanism. */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1705 #endif
1706
1707 /* Multi-threaded TCG
1708  *
1709  * In the multi-threaded case each vCPU has its own thread. The TLS
1710  * variable current_cpu can be used deep in the code to find the
1711  * current CPUState for a given thread.
1712  */
1713
/* Thread body for one vCPU in multi-threaded TCG mode.  @arg is the
 * CPUState to run.  icount must be disabled.
 *
 * After announcing creation the thread alternates between executing guest
 * code (with the iothread lock dropped) and waiting for events, until the
 * CPU is unplugged and can no longer run; it then destroys the vCPU and
 * clears cpu->created.  Returns NULL.
 */
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    g_assert(!use_icount);

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* process any pending work */
    cpu->exit_request = 1;

    do {
        if (cpu_can_run(cpu)) {
            int r;
            qemu_mutex_unlock_iothread();
            r = tcg_cpu_exec(cpu);
            qemu_mutex_lock_iothread();
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start-up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
                /* fall through */
            default:
                /* Ignore everything else? */
                break;
            }
        }

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU destruction */
    qemu_tcg_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1778
1779 static void qemu_cpu_kick_thread(CPUState *cpu)
1780 {
1781 #ifndef _WIN32
1782     int err;
1783
1784     if (cpu->thread_kicked) {
1785         return;
1786     }
1787     cpu->thread_kicked = true;
1788     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1789     if (err && err != ESRCH) {
1790         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1791         exit(1);
1792     }
1793 #else /* _WIN32 */
1794     if (!qemu_cpu_is_self(cpu)) {
1795         if (whpx_enabled()) {
1796             whpx_vcpu_kick(cpu);
1797         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1798             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1799                     __func__, GetLastError());
1800             exit(1);
1801         }
1802     }
1803 #endif
1804 }
1805
1806 void qemu_cpu_kick(CPUState *cpu)
1807 {
1808     qemu_cond_broadcast(cpu->halt_cond);
1809     if (tcg_enabled()) {
1810         if (qemu_tcg_mttcg_enabled()) {
1811             cpu_exit(cpu);
1812         } else {
1813             qemu_cpu_kick_rr_cpus();
1814         }
1815     } else {
1816         if (hax_enabled()) {
1817             /*
1818              * FIXME: race condition with the exit_request check in
1819              * hax_vcpu_hax_exec
1820              */
1821             cpu->exit_request = 1;
1822         }
1823         qemu_cpu_kick_thread(cpu);
1824     }
1825 }
1826
/* Kick the host thread of the calling vCPU; current_cpu must be set. */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1832
/* Return true if the calling thread is the thread recorded in
 * @cpu->thread. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1837
/* Return true if the caller runs in a vCPU thread, i.e. current_cpu is
 * set and its thread is the calling thread. */
bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}
1842
/* Per-thread flag: true while this thread holds the iothread lock taken
 * through qemu_mutex_lock_iothread_impl(). */
static __thread bool iothread_locked = false;

/* Return whether the calling thread currently holds the iothread lock,
 * as tracked by the per-thread flag above. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1849
/*
 * The BQL is taken from so many places that it is worth profiling the
 * callers directly, instead of funneling them all through a single function.
 *
 * Acquire the global mutex on behalf of the caller at @file:@line using
 * the lock function currently installed in qemu_bql_mutex_lock_func.
 * The calling thread must not already hold the lock.
 */
void qemu_mutex_lock_iothread_impl(const char *file, int line)
{
    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);

    g_assert(!qemu_mutex_iothread_locked());
    bql_lock(&qemu_global_mutex, file, line);
    iothread_locked = true;
}
1862
/* Release the global mutex.  The calling thread must hold it; the
 * per-thread flag is cleared before the mutex is actually released. */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1869
/* Wait on @cond using the global mutex as the associated lock. */
void qemu_cond_wait_iothread(QemuCond *cond)
{
    qemu_cond_wait(cond, &qemu_global_mutex);
}
1874
1875 static bool all_vcpus_paused(void)
1876 {
1877     CPUState *cpu;
1878
1879     CPU_FOREACH(cpu) {
1880         if (!cpu->stopped) {
1881             return false;
1882         }
1883     }
1884
1885     return true;
1886 }
1887
/* Stop every vCPU and wait until all of them report 'stopped'.
 *
 * The virtual clock is disabled first.  A vCPU whose thread is the caller
 * is stopped directly; every other vCPU gets a stop request and a kick.
 * The replay mutex is released while waiting and re-acquired afterwards.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    /* qemu_cpu_stop() broadcasts qemu_pause_cond; re-kick after each
     * wakeup until every vCPU has stopped. */
    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /* Re-take the replay mutex with the iothread lock dropped, then take
     * the iothread lock again. */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1918
/* Clear @cpu's stop request and stopped state, then kick it so that it
 * notices the change. */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1925
1926 void resume_all_vcpus(void)
1927 {
1928     CPUState *cpu;
1929
1930     if (!runstate_is_running()) {
1931         return;
1932     }
1933
1934     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1935     CPU_FOREACH(cpu) {
1936         cpu_resume(cpu);
1937     }
1938 }
1939
/* Request removal of @cpu and wait for its thread to terminate.
 *
 * The CPU is flagged as stopping and unplugged and then kicked.  The
 * iothread lock is dropped around the join and held again on return.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1949
1950 /* For temporary buffers for forming a name */
1951 #define VCPU_THREAD_NAME_SIZE 16
1952
/* Set up the host thread and halt condition for a TCG vCPU.
 *
 * With MTTCG every vCPU gets its own thread.  Otherwise the first vCPU
 * creates the single round-robin thread and every later vCPU shares that
 * thread and its halt condition.  TCG regions are initialised on the
 * first call only.
 */
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    /* Thread and condition of the shared round-robin thread, once made. */
    static QemuCond *single_tcg_halt_cond;
    static QemuThread *single_tcg_cpu_thread;
    static int tcg_region_inited;

    assert(tcg_enabled());
    /*
     * Initialize TCG regions--once. Now is a good time, because:
     * (1) TCG's init context, prologue and target globals have been set up.
     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
     *     -accel flag is processed, so the check doesn't work then).
     */
    if (!tcg_region_inited) {
        tcg_region_inited = 1;
        tcg_region_init();
    }

    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);

        if (qemu_tcg_mttcg_enabled()) {
            /* create a thread per vCPU with TCG (MTTCG) */
            parallel_cpus = true;
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);

            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

        } else {
            /* share a single thread for all cpus with TCG */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
            qemu_thread_create(cpu->thread, thread_name,
                               qemu_tcg_rr_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

            /* Remember them so later vCPUs can share the same thread. */
            single_tcg_halt_cond = cpu->halt_cond;
            single_tcg_cpu_thread = cpu->thread;
        }
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    } else {
        /* For non-MTTCG cases we share the thread */
        cpu->thread = single_tcg_cpu_thread;
        cpu->halt_cond = single_tcg_halt_cond;
        cpu->thread_id = first_cpu->thread_id;
        cpu->can_do_io = 1;
        /* No new thread is started, so mark the CPU created here. */
        cpu->created = true;
    }
}
2008
2009 static void qemu_hax_start_vcpu(CPUState *cpu)
2010 {
2011     char thread_name[VCPU_THREAD_NAME_SIZE];
2012
2013     cpu->thread = g_malloc0(sizeof(QemuThread));
2014     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2015     qemu_cond_init(cpu->halt_cond);
2016
2017     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2018              cpu->cpu_index);
2019     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2020                        cpu, QEMU_THREAD_JOINABLE);
2021 #ifdef _WIN32
2022     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2023 #endif
2024 }
2025
2026 static void qemu_kvm_start_vcpu(CPUState *cpu)
2027 {
2028     char thread_name[VCPU_THREAD_NAME_SIZE];
2029
2030     cpu->thread = g_malloc0(sizeof(QemuThread));
2031     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2032     qemu_cond_init(cpu->halt_cond);
2033     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2034              cpu->cpu_index);
2035     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2036                        cpu, QEMU_THREAD_JOINABLE);
2037 }
2038
2039 static void qemu_hvf_start_vcpu(CPUState *cpu)
2040 {
2041     char thread_name[VCPU_THREAD_NAME_SIZE];
2042
2043     /* HVF currently does not support TCG, and only runs in
2044      * unrestricted-guest mode. */
2045     assert(hvf_enabled());
2046
2047     cpu->thread = g_malloc0(sizeof(QemuThread));
2048     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2049     qemu_cond_init(cpu->halt_cond);
2050
2051     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2052              cpu->cpu_index);
2053     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2054                        cpu, QEMU_THREAD_JOINABLE);
2055 }
2056
2057 static void qemu_whpx_start_vcpu(CPUState *cpu)
2058 {
2059     char thread_name[VCPU_THREAD_NAME_SIZE];
2060
2061     cpu->thread = g_malloc0(sizeof(QemuThread));
2062     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2063     qemu_cond_init(cpu->halt_cond);
2064     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2065              cpu->cpu_index);
2066     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2067                        cpu, QEMU_THREAD_JOINABLE);
2068 #ifdef _WIN32
2069     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2070 #endif
2071 }
2072
2073 static void qemu_dummy_start_vcpu(CPUState *cpu)
2074 {
2075     char thread_name[VCPU_THREAD_NAME_SIZE];
2076
2077     cpu->thread = g_malloc0(sizeof(QemuThread));
2078     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2079     qemu_cond_init(cpu->halt_cond);
2080     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2081              cpu->cpu_index);
2082     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2083                        QEMU_THREAD_JOINABLE);
2084 }
2085
2086 void qemu_init_vcpu(CPUState *cpu)
2087 {
2088     MachineState *ms = MACHINE(qdev_get_machine());
2089
2090     cpu->nr_cores = ms->smp.cores;
2091     cpu->nr_threads =  ms->smp.threads;
2092     cpu->stopped = true;
2093     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2094
2095     if (!cpu->as) {
2096         /* If the target cpu hasn't set up any address spaces itself,
2097          * give it the default one.
2098          */
2099         cpu->num_ases = 1;
2100         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2101     }
2102
2103     if (kvm_enabled()) {
2104         qemu_kvm_start_vcpu(cpu);
2105     } else if (hax_enabled()) {
2106         qemu_hax_start_vcpu(cpu);
2107     } else if (hvf_enabled()) {
2108         qemu_hvf_start_vcpu(cpu);
2109     } else if (tcg_enabled()) {
2110         qemu_tcg_init_vcpu(cpu);
2111     } else if (whpx_enabled()) {
2112         qemu_whpx_start_vcpu(cpu);
2113     } else {
2114         qemu_dummy_start_vcpu(cpu);
2115     }
2116
2117     while (!cpu->created) {
2118         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2119     }
2120 }
2121
2122 void cpu_stop_current(void)
2123 {
2124     if (current_cpu) {
2125         current_cpu->stop = true;
2126         cpu_exit(current_cpu);
2127     }
2128 }
2129
2130 int vm_stop(RunState state)
2131 {
2132     if (qemu_in_vcpu_thread()) {
2133         qemu_system_vmstop_request_prepare();
2134         qemu_system_vmstop_request(state);
2135         /*
2136          * FIXME: should not return to device code in case
2137          * vm_stop() has been requested.
2138          */
2139         cpu_stop_current();
2140         return 0;
2141     }
2142
2143     return do_vm_stop(state, true);
2144 }
2145
2146 /**
2147  * Prepare for (re)starting the VM.
2148  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2149  * running or in case of an error condition), 0 otherwise.
2150  */
2151 int vm_prepare_start(void)
2152 {
2153     RunState requested;
2154
2155     qemu_vmstop_requested(&requested);
2156     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2157         return -1;
2158     }
2159
2160     /* Ensure that a STOP/RESUME pair of events is emitted if a
2161      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2162      * example, according to documentation is always followed by
2163      * the STOP event.
2164      */
2165     if (runstate_is_running()) {
2166         qapi_event_send_stop();
2167         qapi_event_send_resume();
2168         return -1;
2169     }
2170
2171     /* We are sending this now, but the CPUs will be resumed shortly later */
2172     qapi_event_send_resume();
2173
2174     cpu_enable_ticks();
2175     runstate_set(RUN_STATE_RUNNING);
2176     vm_state_notify(1, RUN_STATE_RUNNING);
2177     return 0;
2178 }
2179
/*
 * Start the VM: resume all vCPUs, but only when vm_prepare_start()
 * returns 0.
 */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
2186
2187 /* does a state transition even if the VM is already stopped,
2188    current state is forgotten forever */
2189 int vm_stop_force_state(RunState state)
2190 {
2191     if (runstate_is_running()) {
2192         return vm_stop(state);
2193     } else {
2194         runstate_set(state);
2195
2196         bdrv_drain_all();
2197         /* Make sure to return an error if the flush in a previous vm_stop()
2198          * failed. */
2199         return bdrv_flush_all();
2200     }
2201 }
2202
/*
 * Print the CPU models known to the target by invoking cpu_list(), when
 * the target defines that macro.  @optarg is not used.
 */
void list_cpus(const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#ifdef cpu_list
    cpu_list();
#endif
}
2210
2211 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2212                  bool has_cpu, int64_t cpu_index, Error **errp)
2213 {
2214     FILE *f;
2215     uint32_t l;
2216     CPUState *cpu;
2217     uint8_t buf[1024];
2218     int64_t orig_addr = addr, orig_size = size;
2219
2220     if (!has_cpu) {
2221         cpu_index = 0;
2222     }
2223
2224     cpu = qemu_get_cpu(cpu_index);
2225     if (cpu == NULL) {
2226         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2227                    "a CPU number");
2228         return;
2229     }
2230
2231     f = fopen(filename, "wb");
2232     if (!f) {
2233         error_setg_file_open(errp, errno, filename);
2234         return;
2235     }
2236
2237     while (size != 0) {
2238         l = sizeof(buf);
2239         if (l > size)
2240             l = size;
2241         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2242             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2243                              " specified", orig_addr, orig_size);
2244             goto exit;
2245         }
2246         if (fwrite(buf, 1, l, f) != l) {
2247             error_setg(errp, QERR_IO_ERROR);
2248             goto exit;
2249         }
2250         addr += l;
2251         size -= l;
2252     }
2253
2254 exit:
2255     fclose(f);
2256 }
2257
2258 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2259                   Error **errp)
2260 {
2261     FILE *f;
2262     uint32_t l;
2263     uint8_t buf[1024];
2264
2265     f = fopen(filename, "wb");
2266     if (!f) {
2267         error_setg_file_open(errp, errno, filename);
2268         return;
2269     }
2270
2271     while (size != 0) {
2272         l = sizeof(buf);
2273         if (l > size)
2274             l = size;
2275         cpu_physical_memory_read(addr, buf, l);
2276         if (fwrite(buf, 1, l, f) != l) {
2277             error_setg(errp, QERR_IO_ERROR);
2278             goto exit;
2279         }
2280         addr += l;
2281         size -= l;
2282     }
2283
2284 exit:
2285     fclose(f);
2286 }
2287
2288 void qmp_inject_nmi(Error **errp)
2289 {
2290     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2291 }
2292
2293 void dump_drift_info(void)
2294 {
2295     if (!use_icount) {
2296         return;
2297     }
2298
2299     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2300                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2301     if (icount_align_option) {
2302         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2303                     -max_delay / SCALE_MS);
2304         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2305                     max_advance / SCALE_MS);
2306     } else {
2307         qemu_printf("Max guest delay     NA\n");
2308         qemu_printf("Max guest advance   NA\n");
2309     }
2310 }