arch/x86/kvm/x86.c

   1 /*
   2  * Kernel-based Virtual Machine driver for Linux
   3  *
   4  * derived from drivers/kvm/kvm_main.c
   5  *
   6  * Copyright (C) 2006 Qumranet, Inc.
   7  *
   8  * Authors:
   9  *   Avi Kivity   <avi@qumranet.com>
  10  *   Yaniv Kamay  <yaniv@qumranet.com>
  11  *
  12  * This work is licensed under the terms of the GNU GPL, version 2.  See
  13  * the COPYING file in the top-level directory.
  14  *
  15  */
  16
  17 #include <linux/kvm_host.h>
  18 #include "irq.h"
  19 #include "mmu.h"
  20 #include "i8254.h"
  21
  22 #include <linux/clocksource.h>
  23 #include <linux/kvm.h>
  24 #include <linux/fs.h>
  25 #include <linux/vmalloc.h>
  26 #include <linux/module.h>
  27 #include <linux/mman.h>
  28 #include <linux/highmem.h>
  29
  30 #include <asm/uaccess.h>
  31 #include <asm/msr.h>
  32 #include <asm/desc.h>
  33
  34 #define MAX_IO_MSRS 256
  35 #define CR0_RESERVED_BITS                                               \
  36         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
  37                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
  38                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
  39 #define CR4_RESERVED_BITS                                               \
  40         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
  41                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
  42                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
  43                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
  44
  45 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
  46 /* EFER defaults:
  47  * - enable syscall per default because its emulated by KVM
  48  * - enable LME and LMA per default on 64 bit KVM
  49  */
  50 #ifdef CONFIG_X86_64
  51 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
  52 #else
  53 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
  54 #endif
  55
  56 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
  57 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  58
  59 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
  60                                     struct kvm_cpuid_entry2 __user *entries);
  61
  62 struct kvm_x86_ops *kvm_x86_ops;
  63
  64 struct kvm_stats_debugfs_item debugfs_entries[] = {
  65         { "pf_fixed", VCPU_STAT(pf_fixed) },
  66         { "pf_guest", VCPU_STAT(pf_guest) },
  67         { "tlb_flush", VCPU_STAT(tlb_flush) },
  68         { "invlpg", VCPU_STAT(invlpg) },
  69         { "exits", VCPU_STAT(exits) },
  70         { "io_exits", VCPU_STAT(io_exits) },
  71         { "mmio_exits", VCPU_STAT(mmio_exits) },
  72         { "signal_exits", VCPU_STAT(signal_exits) },
  73         { "irq_window", VCPU_STAT(irq_window_exits) },
  74         { "halt_exits", VCPU_STAT(halt_exits) },
  75         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
  76         { "hypercalls", VCPU_STAT(hypercalls) },
  77         { "request_irq", VCPU_STAT(request_irq_exits) },
  78         { "irq_exits", VCPU_STAT(irq_exits) },
  79         { "host_state_reload", VCPU_STAT(host_state_reload) },
  80         { "efer_reload", VCPU_STAT(efer_reload) },
  81         { "fpu_reload", VCPU_STAT(fpu_reload) },
  82         { "insn_emulation", VCPU_STAT(insn_emulation) },
  83         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
  84         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
  85         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
  86         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
  87         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
  88         { "mmu_flooded", VM_STAT(mmu_flooded) },
  89         { "mmu_recycled", VM_STAT(mmu_recycled) },
  90         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
  91         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
  92         { "largepages", VM_STAT(lpages) },
  93         { NULL }
  94 };
  95
  96
  97 unsigned long segment_base(u16 selector)
  98 {
  99         struct descriptor_table gdt;
 100         struct desc_struct *d;
 101         unsigned long table_base;
 102         unsigned long v;
 103
 104         if (selector == 0)
 105                 return 0;
 106
 107         asm("sgdt %0" : "=m"(gdt));
 108         table_base = gdt.base;
 109
 110         if (selector & 4) {           /* from ldt */
 111                 u16 ldt_selector;
 112
 113                 asm("sldt %0" : "=g"(ldt_selector));
 114                 table_base = segment_base(ldt_selector);
 115         }
 116         d = (struct desc_struct *)(table_base + (selector & ~7));
 117         v = d->base0 | ((unsigned long)d->base1 << 16) |
 118                 ((unsigned long)d->base2 << 24);
 119 #ifdef CONFIG_X86_64
 120         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
 121                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
 122 #endif
 123         return v;
 124 }
 125 EXPORT_SYMBOL_GPL(segment_base);
 126
 127 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 128 {
 129         if (irqchip_in_kernel(vcpu->kvm))
 130                 return vcpu->arch.apic_base;
 131         else
 132                 return vcpu->arch.apic_base;
 133 }
 134 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 135
 136 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 137 {
 138         /* TODO: reserve bits check */
 139         if (irqchip_in_kernel(vcpu->kvm))
 140                 kvm_lapic_set_base(vcpu, data);
 141         else
 142                 vcpu->arch.apic_base = data;
 143 }
 144 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 145
 146 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 147 {
 148         WARN_ON(vcpu->arch.exception.pending);
 149         vcpu->arch.exception.pending = true;
 150         vcpu->arch.exception.has_error_code = false;
 151         vcpu->arch.exception.nr = nr;
 152 }
 153 EXPORT_SYMBOL_GPL(kvm_queue_exception);
 154
 155 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 156                            u32 error_code)
 157 {
 158         ++vcpu->stat.pf_guest;
 159         if (vcpu->arch.exception.pending) {
 160                 if (vcpu->arch.exception.nr == PF_VECTOR) {
 161                         printk(KERN_DEBUG "kvm: inject_page_fault:"
 162                                         " double fault 0x%lx\n", addr);
 163                         vcpu->arch.exception.nr = DF_VECTOR;
 164                         vcpu->arch.exception.error_code = 0;
 165                 } else if (vcpu->arch.exception.nr == DF_VECTOR) {
 166                         /* triple fault -> shutdown */
 167                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
 168                 }
 169                 return;
 170         }
 171         vcpu->arch.cr2 = addr;
 172         kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 173 }
 174
 175 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 176 {
 177         WARN_ON(vcpu->arch.exception.pending);
 178         vcpu->arch.exception.pending = true;
 179         vcpu->arch.exception.has_error_code = true;
 180         vcpu->arch.exception.nr = nr;
 181         vcpu->arch.exception.error_code = error_code;
 182 }
 183 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 184
 185 static void __queue_exception(struct kvm_vcpu *vcpu)
 186 {
 187         kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
 188                                      vcpu->arch.exception.has_error_code,
 189                                      vcpu->arch.exception.error_code);
 190 }
 191
 192 /*
 193  * Load the pae pdptrs.  Return true is they are all valid.
 194  */
 195 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 196 {
 197         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 198         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 199         int i;
 200         int ret;
 201         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 202
 203         down_read(&vcpu->kvm->slots_lock);
 204         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
 205                                   offset * sizeof(u64), sizeof(pdpte));
 206         if (ret < 0) {
 207                 ret = 0;
 208                 goto out;
 209         }
 210         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 211                 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
 212                         ret = 0;
 213                         goto out;
 214                 }
 215         }
 216         ret = 1;
 217
 218         memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 219 out:
 220         up_read(&vcpu->kvm->slots_lock);
 221
 222         return ret;
 223 }
 224 EXPORT_SYMBOL_GPL(load_pdptrs);
 225
 226 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 227 {
 228         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 229         bool changed = true;
 230         int r;
 231
 232         if (is_long_mode(vcpu) || !is_pae(vcpu))
 233                 return false;
 234
 235         down_read(&vcpu->kvm->slots_lock);
 236         r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
 237         if (r < 0)
 238                 goto out;
 239         changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 240 out:
 241         up_read(&vcpu->kvm->slots_lock);
 242
 243         return changed;
 244 }
 245
 246 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 247 {
 248         if (cr0 & CR0_RESERVED_BITS) {
 249                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
 250                        cr0, vcpu->arch.cr0);
 251                 kvm_inject_gp(vcpu, 0);
 252                 return;
 253         }
 254
 255         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
 256                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
 257                 kvm_inject_gp(vcpu, 0);
 258                 return;
 259         }
 260
 261         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
 262                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
 263                        "and a clear PE flag\n");
 264                 kvm_inject_gp(vcpu, 0);
 265                 return;
 266         }
 267
 268         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 269 #ifdef CONFIG_X86_64
 270                 if ((vcpu->arch.shadow_efer & EFER_LME)) {
 271                         int cs_db, cs_l;
 272
 273                         if (!is_pae(vcpu)) {
 274                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
 275                                        "in long mode while PAE is disabled\n");
 276                                 kvm_inject_gp(vcpu, 0);
 277                                 return;
 278                         }
 279                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 280                         if (cs_l) {
 281                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
 282                                        "in long mode while CS.L == 1\n");
 283                                 kvm_inject_gp(vcpu, 0);
 284                                 return;
 285
 286                         }
 287                 } else
 288 #endif
 289                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 290                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
 291                                "reserved bits\n");
 292                         kvm_inject_gp(vcpu, 0);
 293                         return;
 294                 }
 295
 296         }
 297
 298         kvm_x86_ops->set_cr0(vcpu, cr0);
 299         vcpu->arch.cr0 = cr0;
 300
 301         kvm_mmu_reset_context(vcpu);
 302         return;
 303 }
 304 EXPORT_SYMBOL_GPL(kvm_set_cr0);
 305
 306 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 307 {
 308         kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
 309 }
 310 EXPORT_SYMBOL_GPL(kvm_lmsw);
 311
 312 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 313 {
 314         if (cr4 & CR4_RESERVED_BITS) {
 315                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
 316                 kvm_inject_gp(vcpu, 0);
 317                 return;
 318         }
 319
 320         if (is_long_mode(vcpu)) {
 321                 if (!(cr4 & X86_CR4_PAE)) {
 322                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
 323                                "in long mode\n");
 324                         kvm_inject_gp(vcpu, 0);
 325                         return;
 326                 }
 327         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
 328                    && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 329                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
 330                 kvm_inject_gp(vcpu, 0);
 331                 return;
 332         }
 333
 334         if (cr4 & X86_CR4_VMXE) {
 335                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 336                 kvm_inject_gp(vcpu, 0);
 337                 return;
 338         }
 339         kvm_x86_ops->set_cr4(vcpu, cr4);
 340         vcpu->arch.cr4 = cr4;
 341         kvm_mmu_reset_context(vcpu);
 342 }
 343 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 344
 345 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 346 {
 347         if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 348                 kvm_mmu_flush_tlb(vcpu);
 349                 return;
 350         }
 351
 352         if (is_long_mode(vcpu)) {
 353                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
 354                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
 355                         kvm_inject_gp(vcpu, 0);
 356                         return;
 357                 }
 358         } else {
 359                 if (is_pae(vcpu)) {
 360                         if (cr3 & CR3_PAE_RESERVED_BITS) {
 361                                 printk(KERN_DEBUG
 362                                        "set_cr3: #GP, reserved bits\n");
 363                                 kvm_inject_gp(vcpu, 0);
 364                                 return;
 365                         }
 366                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
 367                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
 368                                        "reserved bits\n");
 369                                 kvm_inject_gp(vcpu, 0);
 370                                 return;
 371                         }
 372                 }
 373                 /*
 374                  * We don't check reserved bits in nonpae mode, because
 375                  * this isn't enforced, and VMware depends on this.
 376                  */
 377         }
 378
 379         down_read(&vcpu->kvm->slots_lock);
 380         /*
 381          * Does the new cr3 value map to physical memory? (Note, we
 382          * catch an invalid cr3 even in real-mode, because it would
 383          * cause trouble later on when we turn on paging anyway.)
 384          *
 385          * A real CPU would silently accept an invalid cr3 and would
 386          * attempt to use it - with largely undefined (and often hard
 387          * to debug) behavior on the guest side.
 388          */
 389         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
 390                 kvm_inject_gp(vcpu, 0);
 391         else {
 392                 vcpu->arch.cr3 = cr3;
 393                 vcpu->arch.mmu.new_cr3(vcpu);
 394         }
 395         up_read(&vcpu->kvm->slots_lock);
 396 }
 397 EXPORT_SYMBOL_GPL(kvm_set_cr3);
 398
 399 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 400 {
 401         if (cr8 & CR8_RESERVED_BITS) {
 402                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
 403                 kvm_inject_gp(vcpu, 0);
 404                 return;
 405         }
 406         if (irqchip_in_kernel(vcpu->kvm))
 407                 kvm_lapic_set_tpr(vcpu, cr8);
 408         else
 409                 vcpu->arch.cr8 = cr8;
 410 }
 411 EXPORT_SYMBOL_GPL(kvm_set_cr8);
 412
 413 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 414 {
 415         if (irqchip_in_kernel(vcpu->kvm))
 416                 return kvm_lapic_get_cr8(vcpu);
 417         else
 418                 return vcpu->arch.cr8;
 419 }
 420 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 421
 422 /*
 423  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 424  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 425  *
 426  * This list is modified at module load time to reflect the
 427  * capabilities of the host cpu.
 428  */
 429 static u32 msrs_to_save[] = {
 430         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 431         MSR_K6_STAR,
 432 #ifdef CONFIG_X86_64
 433         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 434 #endif
 435         MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 436         MSR_IA32_PERF_STATUS,
 437 };
 438
 439 static unsigned num_msrs_to_save;
 440
 441 static u32 emulated_msrs[] = {
 442         MSR_IA32_MISC_ENABLE,
 443 };
 444
 445 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 446 {
 447         if (efer & efer_reserved_bits) {
 448                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
 449                        efer);
 450                 kvm_inject_gp(vcpu, 0);
 451                 return;
 452         }
 453
 454         if (is_paging(vcpu)
 455             && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
 456                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
 457                 kvm_inject_gp(vcpu, 0);
 458                 return;
 459         }
 460
 461         kvm_x86_ops->set_efer(vcpu, efer);
 462
 463         efer &= ~EFER_LMA;
 464         efer |= vcpu->arch.shadow_efer & EFER_LMA;
 465
 466         vcpu->arch.shadow_efer = efer;
 467 }
 468
 469 void kvm_enable_efer_bits(u64 mask)
 470 {
 471        efer_reserved_bits &= ~mask;
 472 }
 473 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 474
 475
 476 /*
 477  * Writes msr value into into the appropriate "register".
 478  * Returns 0 on success, non-0 otherwise.
 479  * Assumes vcpu_load() was already called.
 480  */
 481 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 482 {
 483         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
 484 }
 485
 486 /*
 487  * Adapt set_msr() to msr_io()'s calling convention
 488  */
 489 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 490 {
 491         return kvm_set_msr(vcpu, index, *data);
 492 }
 493
 494 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 495 {
 496         static int version;
 497         struct kvm_wall_clock wc;
 498         struct timespec wc_ts;
 499
 500         if (!wall_clock)
 501                 return;
 502
 503         version++;
 504
 505         down_read(&kvm->slots_lock);
 506         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 507
 508         wc_ts = current_kernel_time();
 509         wc.wc_sec = wc_ts.tv_sec;
 510         wc.wc_nsec = wc_ts.tv_nsec;
 511         wc.wc_version = version;
 512
 513         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
 514
 515         version++;
 516         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 517         up_read(&kvm->slots_lock);
 518 }
 519
 520 static void kvm_write_guest_time(struct kvm_vcpu *v)
 521 {
 522         struct timespec ts;
 523         unsigned long flags;
 524         struct kvm_vcpu_arch *vcpu = &v->arch;
 525         void *shared_kaddr;
 526
 527         if ((!vcpu->time_page))
 528                 return;
 529
 530         /* Keep irq disabled to prevent changes to the clock */
 531         local_irq_save(flags);
 532         kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
 533                           &vcpu->hv_clock.tsc_timestamp);
 534         ktime_get_ts(&ts);
 535         local_irq_restore(flags);
 536
 537         /* With all the info we got, fill in the values */
 538
 539         vcpu->hv_clock.system_time = ts.tv_nsec +
 540                                      (NSEC_PER_SEC * (u64)ts.tv_sec);
 541         /*
 542          * The interface expects us to write an even number signaling that the
 543          * update is finished. Since the guest won't see the intermediate
 544          * state, we just write "2" at the end
 545          */
 546         vcpu->hv_clock.version = 2;
 547
 548         shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
 549
 550         memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
 551                 sizeof(vcpu->hv_clock));
 552
 553         kunmap_atomic(shared_kaddr, KM_USER0);
 554
 555         mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 556 }
 557
 558
 559 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 560 {
 561         switch (msr) {
 562         case MSR_EFER:
 563                 set_efer(vcpu, data);
 564                 break;
 565         case MSR_IA32_MC0_STATUS:
 566                 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
 567                        __func__, data);
 568                 break;
 569         case MSR_IA32_MCG_STATUS:
 570                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
 571                         __func__, data);
 572                 break;
 573         case MSR_IA32_MCG_CTL:
 574                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
 575                         __func__, data);
 576                 break;
 577         case MSR_IA32_UCODE_REV:
 578         case MSR_IA32_UCODE_WRITE:
 579         case 0x200 ... 0x2ff: /* MTRRs */
 580                 break;
 581         case MSR_IA32_APICBASE:
 582                 kvm_set_apic_base(vcpu, data);
 583                 break;
 584         case MSR_IA32_MISC_ENABLE:
 585                 vcpu->arch.ia32_misc_enable_msr = data;
 586                 break;
 587         case MSR_KVM_WALL_CLOCK:
 588                 vcpu->kvm->arch.wall_clock = data;
 589                 kvm_write_wall_clock(vcpu->kvm, data);
 590                 break;
 591         case MSR_KVM_SYSTEM_TIME: {
 592                 if (vcpu->arch.time_page) {
 593                         kvm_release_page_dirty(vcpu->arch.time_page);
 594                         vcpu->arch.time_page = NULL;
 595                 }
 596
 597                 vcpu->arch.time = data;
 598
 599                 /* we verify if the enable bit is set... */
 600                 if (!(data & 1))
 601                         break;
 602
 603                 /* ...but clean it before doing the actual write */
 604                 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
 605
 606                 vcpu->arch.hv_clock.tsc_to_system_mul =
 607                                         clocksource_khz2mult(tsc_khz, 22);
 608                 vcpu->arch.hv_clock.tsc_shift = 22;
 609
 610                 down_read(&current->mm->mmap_sem);
 611                 down_read(&vcpu->kvm->slots_lock);
 612                 vcpu->arch.time_page =
 613                                 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
 614                 up_read(&vcpu->kvm->slots_lock);
 615                 up_read(&current->mm->mmap_sem);
 616
 617                 if (is_error_page(vcpu->arch.time_page)) {
 618                         kvm_release_page_clean(vcpu->arch.time_page);
 619                         vcpu->arch.time_page = NULL;
 620                 }
 621
 622                 kvm_write_guest_time(vcpu);
 623                 break;
 624         }
 625         default:
 626                 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
 627                 return 1;
 628         }
 629         return 0;
 630 }
 631 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
 632
 633
 634 /*
 635  * Reads an msr value (of 'msr_index') into 'pdata'.
 636  * Returns 0 on success, non-0 otherwise.
 637  * Assumes vcpu_load() was already called.
 638  */
 639 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 640 {
 641         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
 642 }
 643
 644 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 645 {
 646         u64 data;
 647
 648         switch (msr) {
 649         case 0xc0010010: /* SYSCFG */
 650         case 0xc0010015: /* HWCR */
 651         case MSR_IA32_PLATFORM_ID:
 652         case MSR_IA32_P5_MC_ADDR:
 653         case MSR_IA32_P5_MC_TYPE:
 654         case MSR_IA32_MC0_CTL:
 655         case MSR_IA32_MCG_STATUS:
 656         case MSR_IA32_MCG_CAP:
 657         case MSR_IA32_MCG_CTL:
 658         case MSR_IA32_MC0_MISC:
 659         case MSR_IA32_MC0_MISC+4:
 660         case MSR_IA32_MC0_MISC+8:
 661         case MSR_IA32_MC0_MISC+12:
 662         case MSR_IA32_MC0_MISC+16:
 663         case MSR_IA32_UCODE_REV:
 664         case MSR_IA32_EBL_CR_POWERON:
 665                 /* MTRR registers */
 666         case 0xfe:
 667         case 0x200 ... 0x2ff:
 668                 data = 0;
 669                 break;
 670         case 0xcd: /* fsb frequency */
 671                 data = 3;
 672                 break;
 673         case MSR_IA32_APICBASE:
 674                 data = kvm_get_apic_base(vcpu);
 675                 break;
 676         case MSR_IA32_MISC_ENABLE:
 677                 data = vcpu->arch.ia32_misc_enable_msr;
 678                 break;
 679         case MSR_IA32_PERF_STATUS:
 680                 /* TSC increment by tick */
 681                 data = 1000ULL;
 682                 /* CPU multiplier */
 683                 data |= (((uint64_t)4ULL) << 40);
 684                 break;
 685         case MSR_EFER:
 686                 data = vcpu->arch.shadow_efer;
 687                 break;
 688         case MSR_KVM_WALL_CLOCK:
 689                 data = vcpu->kvm->arch.wall_clock;
 690                 break;
 691         case MSR_KVM_SYSTEM_TIME:
 692                 data = vcpu->arch.time;
 693                 break;
 694         default:
 695                 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 696                 return 1;
 697         }
 698         *pdata = data;
 699         return 0;
 700 }
 701 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
 702
 703 /*
 704  * Read or write a bunch of msrs. All parameters are kernel addresses.
 705  *
 706  * @return number of msrs set successfully.
 707  */
 708 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
 709                     struct kvm_msr_entry *entries,
 710                     int (*do_msr)(struct kvm_vcpu *vcpu,
 711                                   unsigned index, u64 *data))
 712 {
 713         int i;
 714
 715         vcpu_load(vcpu);
 716
 717         for (i = 0; i < msrs->nmsrs; ++i)
 718                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
 719                         break;
 720
 721         vcpu_put(vcpu);
 722
 723         return i;
 724 }
 725
 726 /*
 727  * Read or write a bunch of msrs. Parameters are user addresses.
 728  *
 729  * @return number of msrs set successfully.
 730  */
 731 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
 732                   int (*do_msr)(struct kvm_vcpu *vcpu,
 733                                 unsigned index, u64 *data),
 734                   int writeback)
 735 {
 736         struct kvm_msrs msrs;
 737         struct kvm_msr_entry *entries;
 738         int r, n;
 739         unsigned size;
 740
 741         r = -EFAULT;
 742         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
 743                 goto out;
 744
 745         r = -E2BIG;
 746         if (msrs.nmsrs >= MAX_IO_MSRS)
 747                 goto out;
 748
 749         r = -ENOMEM;
 750         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
 751         entries = vmalloc(size);
 752         if (!entries)
 753                 goto out;
 754
 755         r = -EFAULT;
 756         if (copy_from_user(entries, user_msrs->entries, size))
 757                 goto out_free;
 758
 759         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
 760         if (r < 0)
 761                 goto out_free;
 762
 763         r = -EFAULT;
 764         if (writeback && copy_to_user(user_msrs->entries, entries, size))
 765                 goto out_free;
 766
 767         r = n;
 768
 769 out_free:
 770         vfree(entries);
 771 out:
 772         return r;
 773 }
 774
 775 /*
 776  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
 777  * cached on it.
 778  */
 779 void decache_vcpus_on_cpu(int cpu)
 780 {
 781         struct kvm *vm;
 782         struct kvm_vcpu *vcpu;
 783         int i;
 784
 785         spin_lock(&kvm_lock);
 786         list_for_each_entry(vm, &vm_list, vm_list)
 787                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 788                         vcpu = vm->vcpus[i];
 789                         if (!vcpu)
 790                                 continue;
 791                         /*
 792                          * If the vcpu is locked, then it is running on some
 793                          * other cpu and therefore it is not cached on the
 794                          * cpu in question.
 795                          *
 796                          * If it's not locked, check the last cpu it executed
 797                          * on.
 798                          */
 799                         if (mutex_trylock(&vcpu->mutex)) {
 800                                 if (vcpu->cpu == cpu) {
 801                                         kvm_x86_ops->vcpu_decache(vcpu);
 802                                         vcpu->cpu = -1;
 803                                 }
 804                                 mutex_unlock(&vcpu->mutex);
 805                         }
 806                 }
 807         spin_unlock(&kvm_lock);
 808 }
 809
 810 int kvm_dev_ioctl_check_extension(long ext)
 811 {
 812         int r;
 813
 814         switch (ext) {
 815         case KVM_CAP_IRQCHIP:
 816         case KVM_CAP_HLT:
 817         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
 818         case KVM_CAP_USER_MEMORY:
 819         case KVM_CAP_SET_TSS_ADDR:
 820         case KVM_CAP_EXT_CPUID:
 821         case KVM_CAP_CLOCKSOURCE:
 822         case KVM_CAP_PIT:
 823                 r = 1;
 824                 break;
 825         case KVM_CAP_VAPIC:
 826                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
 827                 break;
 828         case KVM_CAP_NR_VCPUS:
 829                 r = KVM_MAX_VCPUS;
 830                 break;
 831         case KVM_CAP_NR_MEMSLOTS:
 832                 r = KVM_MEMORY_SLOTS;
 833                 break;
 834         default:
 835                 r = 0;
 836                 break;
 837         }
 838         return r;
 839
 840 }
 841
 842 long kvm_arch_dev_ioctl(struct file *filp,
 843                         unsigned int ioctl, unsigned long arg)
 844 {
 845         void __user *argp = (void __user *)arg;
 846         long r;
 847
 848         switch (ioctl) {
 849         case KVM_GET_MSR_INDEX_LIST: {
 850                 struct kvm_msr_list __user *user_msr_list = argp;
 851                 struct kvm_msr_list msr_list;
 852                 unsigned n;
 853
 854                 r = -EFAULT;
 855                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
 856                         goto out;
 857                 n = msr_list.nmsrs;
 858                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
 859                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
 860                         goto out;
 861                 r = -E2BIG;
 862                 if (n < num_msrs_to_save)
 863                         goto out;
 864                 r = -EFAULT;
 865                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
 866                                  num_msrs_to_save * sizeof(u32)))
 867                         goto out;
 868                 if (copy_to_user(user_msr_list->indices
 869                                  + num_msrs_to_save * sizeof(u32),
 870                                  &emulated_msrs,
 871                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
 872                         goto out;
 873                 r = 0;
 874                 break;
 875         }
 876         case KVM_GET_SUPPORTED_CPUID: {
 877                 struct kvm_cpuid2 __user *cpuid_arg = argp;
 878                 struct kvm_cpuid2 cpuid;
 879
 880                 r = -EFAULT;
 881                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
 882                         goto out;
 883                 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
 884                         cpuid_arg->entries);
 885                 if (r)
 886                         goto out;
 887
 888                 r = -EFAULT;
 889                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
 890                         goto out;
 891                 r = 0;
 892                 break;
 893         }
 894         default:
 895                 r = -EINVAL;
 896         }
 897 out:
 898         return r;
 899 }
 900
 901 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 902 {
 903         kvm_x86_ops->vcpu_load(vcpu, cpu);
 904         kvm_write_guest_time(vcpu);
 905 }
 906
 907 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 908 {
 909         kvm_x86_ops->vcpu_put(vcpu);
 910         kvm_put_guest_fpu(vcpu);
 911 }
 912
 913 static int is_efer_nx(void)
 914 {
 915         u64 efer;
 916
 917         rdmsrl(MSR_EFER, efer);
 918         return efer & EFER_NX;
 919 }
 920
 921 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
 922 {
 923         int i;
 924         struct kvm_cpuid_entry2 *e, *entry;
 925
 926         entry = NULL;
 927         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
 928                 e = &vcpu->arch.cpuid_entries[i];
 929                 if (e->function == 0x80000001) {
 930                         entry = e;
 931                         break;
 932                 }
 933         }
 934         if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
 935                 entry->edx &= ~(1 << 20);
 936                 printk(KERN_INFO "kvm: guest NX capability removed\n");
 937         }
 938 }
 939
 940 /* when an old userspace process fills a new kernel module */
 941 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 942                                     struct kvm_cpuid *cpuid,
 943                                     struct kvm_cpuid_entry __user *entries)
 944 {
 945         int r, i;
 946         struct kvm_cpuid_entry *cpuid_entries;
 947
 948         r = -E2BIG;
 949         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
 950                 goto out;
 951         r = -ENOMEM;
 952         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
 953         if (!cpuid_entries)
 954                 goto out;
 955         r = -EFAULT;
 956         if (copy_from_user(cpuid_entries, entries,
 957                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
 958                 goto out_free;
 959         for (i = 0; i < cpuid->nent; i++) {
 960                 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
 961                 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
 962                 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
 963                 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
 964                 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
 965                 vcpu->arch.cpuid_entries[i].index = 0;
 966                 vcpu->arch.cpuid_entries[i].flags = 0;
 967                 vcpu->arch.cpuid_entries[i].padding[0] = 0;
 968                 vcpu->arch.cpuid_entries[i].padding[1] = 0;
 969                 vcpu->arch.cpuid_entries[i].padding[2] = 0;
 970         }
 971         vcpu->arch.cpuid_nent = cpuid->nent;
 972         cpuid_fix_nx_cap(vcpu);
 973         r = 0;
 974
 975 out_free:
 976         vfree(cpuid_entries);
 977 out:
 978         return r;
 979 }
 980
 981 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
 982                                     struct kvm_cpuid2 *cpuid,
 983                                     struct kvm_cpuid_entry2 __user *entries)
 984 {
 985         int r;
 986
 987         r = -E2BIG;
 988         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
 989                 goto out;
 990         r = -EFAULT;
 991         if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
 992                            cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
 993                 goto out;
 994         vcpu->arch.cpuid_nent = cpuid->nent;
 995         return 0;
 996
 997 out:
 998         return r;
 999 }
1000
1001 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1002                                     struct kvm_cpuid2 *cpuid,
1003                                     struct kvm_cpuid_entry2 __user *entries)
1004 {
1005         int r;
1006
1007         r = -E2BIG;
1008         if (cpuid->nent < vcpu->arch.cpuid_nent)
1009                 goto out;
1010         r = -EFAULT;
1011         if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1012                            vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1013                 goto out;
1014         return 0;
1015
1016 out:
1017         cpuid->nent = vcpu->arch.cpuid_nent;
1018         return r;
1019 }
1020
1021 static inline u32 bit(int bitno)
1022 {
1023         return 1 << (bitno & 31);
1024 }
1025
1026 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1027                           u32 index)
1028 {
1029         entry->function = function;
1030         entry->index = index;
1031         cpuid_count(entry->function, entry->index,
1032                 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1033         entry->flags = 0;
1034 }
1035
1036 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1037                          u32 index, int *nent, int maxnent)
1038 {
1039         const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
1040                 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1041                 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1042                 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1043                 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1044                 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
1045                 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1046                 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
1047                 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
1048                 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
1049         const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
1050                 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1051                 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1052                 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1053                 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1054                 bit(X86_FEATURE_PGE) |
1055                 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1056                 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
1057                 bit(X86_FEATURE_SYSCALL) |
1058                 (bit(X86_FEATURE_NX) && is_efer_nx()) |
1059 #ifdef CONFIG_X86_64
1060                 bit(X86_FEATURE_LM) |
1061 #endif
1062                 bit(X86_FEATURE_MMXEXT) |
1063                 bit(X86_FEATURE_3DNOWEXT) |
1064                 bit(X86_FEATURE_3DNOW);
1065         const u32 kvm_supported_word3_x86_features =
1066                 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
1067         const u32 kvm_supported_word6_x86_features =
1068                 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
1069
1070         /* all func 2 cpuid_count() should be called on the same cpu */
1071         get_cpu();
1072         do_cpuid_1_ent(entry, function, index);
1073         ++*nent;
1074
1075         switch (function) {
1076         case 0:
1077                 entry->eax = min(entry->eax, (u32)0xb);
1078                 break;
1079         case 1:
1080                 entry->edx &= kvm_supported_word0_x86_features;
1081                 entry->ecx &= kvm_supported_word3_x86_features;
1082                 break;
1083         /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1084          * may return different values. This forces us to get_cpu() before
1085          * issuing the first command, and also to emulate this annoying behavior
1086          * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1087         case 2: {
1088                 int t, times = entry->eax & 0xff;
1089
1090                 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1091                 for (t = 1; t < times && *nent < maxnent; ++t) {
1092                         do_cpuid_1_ent(&entry[t], function, 0);
1093                         entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1094                         ++*nent;
1095                 }
1096                 break;
1097         }
1098         /* function 4 and 0xb have additional index. */
1099         case 4: {
1100                 int i, cache_type;
1101
1102                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1103                 /* read more entries until cache_type is zero */
1104                 for (i = 1; *nent < maxnent; ++i) {
1105                         cache_type = entry[i - 1].eax & 0x1f;
1106                         if (!cache_type)
1107                                 break;
1108                         do_cpuid_1_ent(&entry[i], function, i);
1109                         entry[i].flags |=
1110                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1111                         ++*nent;
1112                 }
1113                 break;
1114         }
1115         case 0xb: {
1116                 int i, level_type;
1117
1118                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1119                 /* read more entries until level_type is zero */
1120                 for (i = 1; *nent < maxnent; ++i) {
1121                         level_type = entry[i - 1].ecx & 0xff;
1122                         if (!level_type)
1123                                 break;
1124                         do_cpuid_1_ent(&entry[i], function, i);
1125                         entry[i].flags |=
1126                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1127                         ++*nent;
1128                 }
1129                 break;
1130         }
1131         case 0x80000000:
1132                 entry->eax = min(entry->eax, 0x8000001a);
1133                 break;
1134         case 0x80000001:
1135                 entry->edx &= kvm_supported_word1_x86_features;
1136                 entry->ecx &= kvm_supported_word6_x86_features;
1137                 break;
1138         }
1139         put_cpu();
1140 }
1141
1142 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1143                                     struct kvm_cpuid_entry2 __user *entries)
1144 {
1145         struct kvm_cpuid_entry2 *cpuid_entries;
1146         int limit, nent = 0, r = -E2BIG;
1147         u32 func;
1148
1149         if (cpuid->nent < 1)
1150                 goto out;
1151         r = -ENOMEM;
1152         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1153         if (!cpuid_entries)
1154                 goto out;
1155
1156         do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1157         limit = cpuid_entries[0].eax;
1158         for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1159                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1160                                 &nent, cpuid->nent);
1161         r = -E2BIG;
1162         if (nent >= cpuid->nent)
1163                 goto out_free;
1164
1165         do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1166         limit = cpuid_entries[nent - 1].eax;
1167         for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1168                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1169                                &nent, cpuid->nent);
1170         r = -EFAULT;
1171         if (copy_to_user(entries, cpuid_entries,
1172                         nent * sizeof(struct kvm_cpuid_entry2)))
1173                 goto out_free;
1174         cpuid->nent = nent;
1175         r = 0;
1176
1177 out_free:
1178         vfree(cpuid_entries);
1179 out:
1180         return r;
1181 }
1182
1183 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1184                                     struct kvm_lapic_state *s)
1185 {
1186         vcpu_load(vcpu);
1187         memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1188         vcpu_put(vcpu);
1189
1190         return 0;
1191 }
1192
1193 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1194                                     struct kvm_lapic_state *s)
1195 {
1196         vcpu_load(vcpu);
1197         memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1198         kvm_apic_post_state_restore(vcpu);
1199         vcpu_put(vcpu);
1200
1201         return 0;
1202 }
1203
1204 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1205                                     struct kvm_interrupt *irq)
1206 {
1207         if (irq->irq < 0 || irq->irq >= 256)
1208                 return -EINVAL;
1209         if (irqchip_in_kernel(vcpu->kvm))
1210                 return -ENXIO;
1211         vcpu_load(vcpu);
1212
1213         set_bit(irq->irq, vcpu->arch.irq_pending);
1214         set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1215
1216         vcpu_put(vcpu);
1217
1218         return 0;
1219 }
1220
1221 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1222                                            struct kvm_tpr_access_ctl *tac)
1223 {
1224         if (tac->flags)
1225                 return -EINVAL;
1226         vcpu->arch.tpr_access_reporting = !!tac->enabled;
1227         return 0;
1228 }
1229
1230 long kvm_arch_vcpu_ioctl(struct file *filp,
1231                          unsigned int ioctl, unsigned long arg)
1232 {
1233         struct kvm_vcpu *vcpu = filp->private_data;
1234         void __user *argp = (void __user *)arg;
1235         int r;
1236
1237         switch (ioctl) {
1238         case KVM_GET_LAPIC: {
1239                 struct kvm_lapic_state lapic;
1240
1241                 memset(&lapic, 0, sizeof lapic);
1242                 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
1243                 if (r)
1244                         goto out;
1245                 r = -EFAULT;
1246                 if (copy_to_user(argp, &lapic, sizeof lapic))
1247                         goto out;
1248                 r = 0;
1249                 break;
1250         }
1251         case KVM_SET_LAPIC: {
1252                 struct kvm_lapic_state lapic;
1253
1254                 r = -EFAULT;
1255                 if (copy_from_user(&lapic, argp, sizeof lapic))
1256                         goto out;
1257                 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
1258                 if (r)
1259                         goto out;
1260                 r = 0;
1261                 break;
1262         }
1263         case KVM_INTERRUPT: {
1264                 struct kvm_interrupt irq;
1265
1266                 r = -EFAULT;
1267                 if (copy_from_user(&irq, argp, sizeof irq))
1268                         goto out;
1269                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1270                 if (r)
1271                         goto out;
1272                 r = 0;
1273                 break;
1274         }
1275         case KVM_SET_CPUID: {
1276                 struct kvm_cpuid __user *cpuid_arg = argp;
1277                 struct kvm_cpuid cpuid;
1278
1279                 r = -EFAULT;
1280                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1281                         goto out;
1282                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1283                 if (r)
1284                         goto out;
1285                 break;
1286         }
1287         case KVM_SET_CPUID2: {
1288                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1289                 struct kvm_cpuid2 cpuid;
1290
1291                 r = -EFAULT;
1292                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1293                         goto out;
1294                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1295                                 cpuid_arg->entries);
1296                 if (r)
1297                         goto out;
1298                 break;
1299         }
1300         case KVM_GET_CPUID2: {
1301                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1302                 struct kvm_cpuid2 cpuid;
1303
1304                 r = -EFAULT;
1305                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1306                         goto out;
1307                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1308                                 cpuid_arg->entries);
1309                 if (r)
1310                         goto out;
1311                 r = -EFAULT;
1312                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1313                         goto out;
1314                 r = 0;
1315                 break;
1316         }
1317         case KVM_GET_MSRS:
1318                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1319                 break;
1320         case KVM_SET_MSRS:
1321                 r = msr_io(vcpu, argp, do_set_msr, 0);
1322                 break;
1323         case KVM_TPR_ACCESS_REPORTING: {
1324                 struct kvm_tpr_access_ctl tac;
1325
1326                 r = -EFAULT;
1327                 if (copy_from_user(&tac, argp, sizeof tac))
1328                         goto out;
1329                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1330                 if (r)
1331                         goto out;
1332                 r = -EFAULT;
1333                 if (copy_to_user(argp, &tac, sizeof tac))
1334                         goto out;
1335                 r = 0;
1336                 break;
1337         };
1338         case KVM_SET_VAPIC_ADDR: {
1339                 struct kvm_vapic_addr va;
1340
1341                 r = -EINVAL;
1342                 if (!irqchip_in_kernel(vcpu->kvm))
1343                         goto out;
1344                 r = -EFAULT;
1345                 if (copy_from_user(&va, argp, sizeof va))
1346                         goto out;
1347                 r = 0;
1348                 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1349                 break;
1350         }
1351         default:
1352                 r = -EINVAL;
1353         }
1354 out:
1355         return r;
1356 }
1357
1358 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1359 {
1360         int ret;
1361
1362         if (addr > (unsigned int)(-3 * PAGE_SIZE))
1363                 return -1;
1364         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1365         return ret;
1366 }
1367
1368 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1369                                           u32 kvm_nr_mmu_pages)
1370 {
1371         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1372                 return -EINVAL;
1373
1374         down_write(&kvm->slots_lock);
1375
1376         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1377         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1378
1379         up_write(&kvm->slots_lock);
1380         return 0;
1381 }
1382
1383 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1384 {
1385         return kvm->arch.n_alloc_mmu_pages;
1386 }
1387
1388 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1389 {
1390         int i;
1391         struct kvm_mem_alias *alias;
1392
1393         for (i = 0; i < kvm->arch.naliases; ++i) {
1394                 alias = &kvm->arch.aliases[i];
1395                 if (gfn >= alias->base_gfn
1396                     && gfn < alias->base_gfn + alias->npages)
1397                         return alias->target_gfn + gfn - alias->base_gfn;
1398         }
1399         return gfn;
1400 }
1401
1402 /*
1403  * Set a new alias region.  Aliases map a portion of physical memory into
1404  * another portion.  This is useful for memory windows, for example the PC
1405  * VGA region.
1406  */
1407 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1408                                          struct kvm_memory_alias *alias)
1409 {
1410         int r, n;
1411         struct kvm_mem_alias *p;
1412
1413         r = -EINVAL;
1414         /* General sanity checks */
1415         if (alias->memory_size & (PAGE_SIZE - 1))
1416                 goto out;
1417         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1418                 goto out;
1419         if (alias->slot >= KVM_ALIAS_SLOTS)
1420                 goto out;
1421         if (alias->guest_phys_addr + alias->memory_size
1422             < alias->guest_phys_addr)
1423                 goto out;
1424         if (alias->target_phys_addr + alias->memory_size
1425             < alias->target_phys_addr)
1426                 goto out;
1427
1428         down_write(&kvm->slots_lock);
1429
1430         p = &kvm->arch.aliases[alias->slot];
1431         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1432         p->npages = alias->memory_size >> PAGE_SHIFT;
1433         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1434
1435         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1436                 if (kvm->arch.aliases[n - 1].npages)
1437                         break;
1438         kvm->arch.naliases = n;
1439
1440         kvm_mmu_zap_all(kvm);
1441
1442         up_write(&kvm->slots_lock);
1443
1444         return 0;
1445
1446 out:
1447         return r;
1448 }
1449
1450 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1451 {
1452         int r;
1453
1454         r = 0;
1455         switch (chip->chip_id) {
1456         case KVM_IRQCHIP_PIC_MASTER:
1457                 memcpy(&chip->chip.pic,
1458                         &pic_irqchip(kvm)->pics[0],
1459                         sizeof(struct kvm_pic_state));
1460                 break;
1461         case KVM_IRQCHIP_PIC_SLAVE:
1462                 memcpy(&chip->chip.pic,
1463                         &pic_irqchip(kvm)->pics[1],
1464                         sizeof(struct kvm_pic_state));
1465                 break;
1466         case KVM_IRQCHIP_IOAPIC:
1467                 memcpy(&chip->chip.ioapic,
1468                         ioapic_irqchip(kvm),
1469                         sizeof(struct kvm_ioapic_state));
1470                 break;
1471         default:
1472                 r = -EINVAL;
1473                 break;
1474         }
1475         return r;
1476 }
1477
1478 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1479 {
1480         int r;
1481
1482         r = 0;
1483         switch (chip->chip_id) {
1484         case KVM_IRQCHIP_PIC_MASTER:
1485                 memcpy(&pic_irqchip(kvm)->pics[0],
1486                         &chip->chip.pic,
1487                         sizeof(struct kvm_pic_state));
1488                 break;
1489         case KVM_IRQCHIP_PIC_SLAVE:
1490                 memcpy(&pic_irqchip(kvm)->pics[1],
1491                         &chip->chip.pic,
1492                         sizeof(struct kvm_pic_state));
1493                 break;
1494         case KVM_IRQCHIP_IOAPIC:
1495                 memcpy(ioapic_irqchip(kvm),
1496                         &chip->chip.ioapic,
1497                         sizeof(struct kvm_ioapic_state));
1498                 break;
1499         default:
1500                 r = -EINVAL;
1501                 break;
1502         }
1503         kvm_pic_update_irq(pic_irqchip(kvm));
1504         return r;
1505 }
1506
1507 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1508 {
1509         int r = 0;
1510
1511         memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
1512         return r;
1513 }
1514
1515 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1516 {
1517         int r = 0;
1518
1519         memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
1520         kvm_pit_load_count(kvm, 0, ps->channels[0].count);
1521         return r;
1522 }
1523
1524 /*
1525  * Get (and clear) the dirty memory log for a memory slot.
1526  */
1527 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1528                                       struct kvm_dirty_log *log)
1529 {
1530         int r;
1531         int n;
1532         struct kvm_memory_slot *memslot;
1533         int is_dirty = 0;
1534
1535         down_write(&kvm->slots_lock);
1536
1537         r = kvm_get_dirty_log(kvm, log, &is_dirty);
1538         if (r)
1539                 goto out;
1540
1541         /* If nothing is dirty, don't bother messing with page tables. */
1542         if (is_dirty) {
1543                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1544                 kvm_flush_remote_tlbs(kvm);
1545                 memslot = &kvm->memslots[log->slot];
1546                 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1547                 memset(memslot->dirty_bitmap, 0, n);
1548         }
1549         r = 0;
1550 out:
1551         up_write(&kvm->slots_lock);
1552         return r;
1553 }
1554
1555 long kvm_arch_vm_ioctl(struct file *filp,
1556                        unsigned int ioctl, unsigned long arg)
1557 {
1558         struct kvm *kvm = filp->private_data;
1559         void __user *argp = (void __user *)arg;
1560         int r = -EINVAL;
1561
1562         switch (ioctl) {
1563         case KVM_SET_TSS_ADDR:
1564                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1565                 if (r < 0)
1566                         goto out;
1567                 break;
1568         case KVM_SET_MEMORY_REGION: {
1569                 struct kvm_memory_region kvm_mem;
1570                 struct kvm_userspace_memory_region kvm_userspace_mem;
1571
1572                 r = -EFAULT;
1573                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1574                         goto out;
1575                 kvm_userspace_mem.slot = kvm_mem.slot;
1576                 kvm_userspace_mem.flags = kvm_mem.flags;
1577                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1578                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1579                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1580                 if (r)
1581                         goto out;
1582                 break;
1583         }
1584         case KVM_SET_NR_MMU_PAGES:
1585                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1586                 if (r)
1587                         goto out;
1588                 break;
1589         case KVM_GET_NR_MMU_PAGES:
1590                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1591                 break;
1592         case KVM_SET_MEMORY_ALIAS: {
1593                 struct kvm_memory_alias alias;
1594
1595                 r = -EFAULT;
1596                 if (copy_from_user(&alias, argp, sizeof alias))
1597                         goto out;
1598                 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
1599                 if (r)
1600                         goto out;
1601                 break;
1602         }
1603         case KVM_CREATE_IRQCHIP:
1604                 r = -ENOMEM;
1605                 kvm->arch.vpic = kvm_create_pic(kvm);
1606                 if (kvm->arch.vpic) {
1607                         r = kvm_ioapic_init(kvm);
1608                         if (r) {
1609                                 kfree(kvm->arch.vpic);
1610                                 kvm->arch.vpic = NULL;
1611                                 goto out;
1612                         }
1613                 } else
1614                         goto out;
1615                 break;
1616         case KVM_CREATE_PIT:
1617                 r = -ENOMEM;
1618                 kvm->arch.vpit = kvm_create_pit(kvm);
1619                 if (kvm->arch.vpit)
1620                         r = 0;
1621                 break;
1622         case KVM_IRQ_LINE: {
1623                 struct kvm_irq_level irq_event;
1624
1625                 r = -EFAULT;
1626                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
1627                         goto out;
1628                 if (irqchip_in_kernel(kvm)) {
1629                         mutex_lock(&kvm->lock);
1630                         if (irq_event.irq < 16)
1631                                 kvm_pic_set_irq(pic_irqchip(kvm),
1632                                         irq_event.irq,
1633                                         irq_event.level);
1634                         kvm_ioapic_set_irq(kvm->arch.vioapic,
1635                                         irq_event.irq,
1636                                         irq_event.level);
1637                         mutex_unlock(&kvm->lock);
1638                         r = 0;
1639                 }
1640                 break;
1641         }
1642         case KVM_GET_IRQCHIP: {
1643                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1644                 struct kvm_irqchip chip;
1645
1646                 r = -EFAULT;
1647                 if (copy_from_user(&chip, argp, sizeof chip))
1648                         goto out;
1649                 r = -ENXIO;
1650                 if (!irqchip_in_kernel(kvm))
1651                         goto out;
1652                 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
1653                 if (r)
1654                         goto out;
1655                 r = -EFAULT;
1656                 if (copy_to_user(argp, &chip, sizeof chip))
1657                         goto out;
1658                 r = 0;
1659                 break;
1660         }
1661         case KVM_SET_IRQCHIP: {
1662                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1663                 struct kvm_irqchip chip;
1664
1665                 r = -EFAULT;
1666                 if (copy_from_user(&chip, argp, sizeof chip))
1667                         goto out;
1668                 r = -ENXIO;
1669                 if (!irqchip_in_kernel(kvm))
1670                         goto out;
1671                 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
1672                 if (r)
1673                         goto out;
1674                 r = 0;
1675                 break;
1676         }
1677         case KVM_GET_PIT: {
1678                 struct kvm_pit_state ps;
1679                 r = -EFAULT;
1680                 if (copy_from_user(&ps, argp, sizeof ps))
1681                         goto out;
1682                 r = -ENXIO;
1683                 if (!kvm->arch.vpit)
1684                         goto out;
1685                 r = kvm_vm_ioctl_get_pit(kvm, &ps);
1686                 if (r)
1687                         goto out;
1688                 r = -EFAULT;
1689                 if (copy_to_user(argp, &ps, sizeof ps))
1690                         goto out;
1691                 r = 0;
1692                 break;
1693         }
1694         case KVM_SET_PIT: {
1695                 struct kvm_pit_state ps;
1696                 r = -EFAULT;
1697                 if (copy_from_user(&ps, argp, sizeof ps))
1698                         goto out;
1699                 r = -ENXIO;
1700                 if (!kvm->arch.vpit)
1701                         goto out;
1702                 r = kvm_vm_ioctl_set_pit(kvm, &ps);
1703                 if (r)
1704                         goto out;
1705                 r = 0;
1706                 break;
1707         }
1708         default:
1709                 ;
1710         }
1711 out:
1712         return r;
1713 }
1714
1715 static void kvm_init_msr_list(void)
1716 {
1717         u32 dummy[2];
1718         unsigned i, j;
1719
1720         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1721                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1722                         continue;
1723                 if (j < i)
1724                         msrs_to_save[j] = msrs_to_save[i];
1725                 j++;
1726         }
1727         num_msrs_to_save = j;
1728 }
1729
1730 /*
1731  * Only apic need an MMIO device hook, so shortcut now..
1732  */
1733 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1734                                                 gpa_t addr)
1735 {
1736         struct kvm_io_device *dev;
1737
1738         if (vcpu->arch.apic) {
1739                 dev = &vcpu->arch.apic->dev;
1740                 if (dev->in_range(dev, addr))
1741                         return dev;
1742         }
1743         return NULL;
1744 }
1745
1746
1747 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1748                                                 gpa_t addr)
1749 {
1750         struct kvm_io_device *dev;
1751
1752         dev = vcpu_find_pervcpu_dev(vcpu, addr);
1753         if (dev == NULL)
1754                 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1755         return dev;
1756 }
1757
1758 int emulator_read_std(unsigned long addr,
1759                              void *val,
1760                              unsigned int bytes,
1761                              struct kvm_vcpu *vcpu)
1762 {
1763         void *data = val;
1764         int r = X86EMUL_CONTINUE;
1765
1766         down_read(&vcpu->kvm->slots_lock);
1767         while (bytes) {
1768                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1769                 unsigned offset = addr & (PAGE_SIZE-1);
1770                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1771                 int ret;
1772
1773                 if (gpa == UNMAPPED_GVA) {
1774                         r = X86EMUL_PROPAGATE_FAULT;
1775                         goto out;
1776                 }
1777                 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1778                 if (ret < 0) {
1779                         r = X86EMUL_UNHANDLEABLE;
1780                         goto out;
1781                 }
1782
1783                 bytes -= tocopy;
1784                 data += tocopy;
1785                 addr += tocopy;
1786         }
1787 out:
1788         up_read(&vcpu->kvm->slots_lock);
1789         return r;
1790 }
1791 EXPORT_SYMBOL_GPL(emulator_read_std);
1792
1793 static int emulator_read_emulated(unsigned long addr,
1794                                   void *val,
1795                                   unsigned int bytes,
1796                                   struct kvm_vcpu *vcpu)
1797 {
1798         struct kvm_io_device *mmio_dev;
1799         gpa_t                 gpa;
1800
1801         if (vcpu->mmio_read_completed) {
1802                 memcpy(val, vcpu->mmio_data, bytes);
1803                 vcpu->mmio_read_completed = 0;
1804                 return X86EMUL_CONTINUE;
1805         }
1806
1807         down_read(&vcpu->kvm->slots_lock);
1808         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1809         up_read(&vcpu->kvm->slots_lock);
1810
1811         /* For APIC access vmexit */
1812         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1813                 goto mmio;
1814
1815         if (emulator_read_std(addr, val, bytes, vcpu)
1816                         == X86EMUL_CONTINUE)
1817                 return X86EMUL_CONTINUE;
1818         if (gpa == UNMAPPED_GVA)
1819                 return X86EMUL_PROPAGATE_FAULT;
1820
1821 mmio:
1822         /*
1823          * Is this MMIO handled locally?
1824          */
1825         mutex_lock(&vcpu->kvm->lock);
1826         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1827         if (mmio_dev) {
1828                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1829                 mutex_unlock(&vcpu->kvm->lock);
1830                 return X86EMUL_CONTINUE;
1831         }
1832         mutex_unlock(&vcpu->kvm->lock);
1833
1834         vcpu->mmio_needed = 1;
1835         vcpu->mmio_phys_addr = gpa;
1836         vcpu->mmio_size = bytes;
1837         vcpu->mmio_is_write = 0;
1838
1839         return X86EMUL_UNHANDLEABLE;
1840 }
1841
1842 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1843                                const void *val, int bytes)
1844 {
1845         int ret;
1846
1847         down_read(&vcpu->kvm->slots_lock);
1848         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1849         if (ret < 0) {
1850                 up_read(&vcpu->kvm->slots_lock);
1851                 return 0;
1852         }
1853         kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1854         up_read(&vcpu->kvm->slots_lock);
1855         return 1;
1856 }
1857
1858 static int emulator_write_emulated_onepage(unsigned long addr,
1859                                            const void *val,
1860                                            unsigned int bytes,
1861                                            struct kvm_vcpu *vcpu)
1862 {
1863         struct kvm_io_device *mmio_dev;
1864         gpa_t                 gpa;
1865
1866         down_read(&vcpu->kvm->slots_lock);
1867         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1868         up_read(&vcpu->kvm->slots_lock);
1869
1870         if (gpa == UNMAPPED_GVA) {
1871                 kvm_inject_page_fault(vcpu, addr, 2);
1872                 return X86EMUL_PROPAGATE_FAULT;
1873         }
1874
1875         /* For APIC access vmexit */
1876         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1877                 goto mmio;
1878
1879         if (emulator_write_phys(vcpu, gpa, val, bytes))
1880                 return X86EMUL_CONTINUE;
1881
1882 mmio:
1883         /*
1884          * Is this MMIO handled locally?
1885          */
1886         mutex_lock(&vcpu->kvm->lock);
1887         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1888         if (mmio_dev) {
1889                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1890                 mutex_unlock(&vcpu->kvm->lock);
1891                 return X86EMUL_CONTINUE;
1892         }
1893         mutex_unlock(&vcpu->kvm->lock);
1894
1895         vcpu->mmio_needed = 1;
1896         vcpu->mmio_phys_addr = gpa;
1897         vcpu->mmio_size = bytes;
1898         vcpu->mmio_is_write = 1;
1899         memcpy(vcpu->mmio_data, val, bytes);
1900
1901         return X86EMUL_CONTINUE;
1902 }
1903
1904 int emulator_write_emulated(unsigned long addr,
1905                                    const void *val,
1906                                    unsigned int bytes,
1907                                    struct kvm_vcpu *vcpu)
1908 {
1909         /* Crossing a page boundary? */
1910         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1911                 int rc, now;
1912
1913                 now = -addr & ~PAGE_MASK;
1914                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1915                 if (rc != X86EMUL_CONTINUE)
1916                         return rc;
1917                 addr += now;
1918                 val += now;
1919                 bytes -= now;
1920         }
1921         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1922 }
1923 EXPORT_SYMBOL_GPL(emulator_write_emulated);
1924
1925 static int emulator_cmpxchg_emulated(unsigned long addr,
1926                                      const void *old,
1927                                      const void *new,
1928                                      unsigned int bytes,
1929                                      struct kvm_vcpu *vcpu)
1930 {
1931         static int reported;
1932
1933         if (!reported) {
1934                 reported = 1;
1935                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1936         }
1937 #ifndef CONFIG_X86_64
1938         /* guests cmpxchg8b have to be emulated atomically */
1939         if (bytes == 8) {
1940                 gpa_t gpa;
1941                 struct page *page;
1942                 char *kaddr;
1943                 u64 val;
1944
1945                 down_read(&vcpu->kvm->slots_lock);
1946                 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1947
1948                 if (gpa == UNMAPPED_GVA ||
1949                    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1950                         goto emul_write;
1951
1952                 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
1953                         goto emul_write;
1954
1955                 val = *(u64 *)new;
1956
1957                 down_read(&current->mm->mmap_sem);
1958                 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1959                 up_read(&current->mm->mmap_sem);
1960
1961                 kaddr = kmap_atomic(page, KM_USER0);
1962                 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
1963                 kunmap_atomic(kaddr, KM_USER0);
1964                 kvm_release_page_dirty(page);
1965         emul_write:
1966                 up_read(&vcpu->kvm->slots_lock);
1967         }
1968 #endif
1969
1970         return emulator_write_emulated(addr, new, bytes, vcpu);
1971 }
1972
1973 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1974 {
1975         return kvm_x86_ops->get_segment_base(vcpu, seg);
1976 }
1977
1978 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1979 {
1980         return X86EMUL_CONTINUE;
1981 }
1982
1983 int emulate_clts(struct kvm_vcpu *vcpu)
1984 {
1985         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
1986         return X86EMUL_CONTINUE;
1987 }
1988
1989 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1990 {
1991         struct kvm_vcpu *vcpu = ctxt->vcpu;
1992
1993         switch (dr) {
1994         case 0 ... 3:
1995                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1996                 return X86EMUL_CONTINUE;
1997         default:
1998                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
1999                 return X86EMUL_UNHANDLEABLE;
2000         }
2001 }
2002
2003 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2004 {
2005         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2006         int exception;
2007
2008         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
2009         if (exception) {
2010                 /* FIXME: better handling */
2011                 return X86EMUL_UNHANDLEABLE;
2012         }
2013         return X86EMUL_CONTINUE;
2014 }
2015
2016 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2017 {
2018         static int reported;
2019         u8 opcodes[4];
2020         unsigned long rip = vcpu->arch.rip;
2021         unsigned long rip_linear;
2022
2023         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2024
2025         if (reported)
2026                 return;
2027
2028         emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
2029
2030         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2031                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2032         reported = 1;
2033 }
2034 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2035
2036 static struct x86_emulate_ops emulate_ops = {
2037         .read_std            = emulator_read_std,
2038         .read_emulated       = emulator_read_emulated,
2039         .write_emulated      = emulator_write_emulated,
2040         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
2041 };
2042
2043 int emulate_instruction(struct kvm_vcpu *vcpu,
2044                         struct kvm_run *run,
2045                         unsigned long cr2,
2046                         u16 error_code,
2047                         int emulation_type)
2048 {
2049         int r;
2050         struct decode_cache *c;
2051
2052         vcpu->arch.mmio_fault_cr2 = cr2;
2053         kvm_x86_ops->cache_regs(vcpu);
2054
2055         vcpu->mmio_is_write = 0;
2056         vcpu->arch.pio.string = 0;
2057
2058         if (!(emulation_type & EMULTYPE_NO_DECODE)) {
2059                 int cs_db, cs_l;
2060                 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2061
2062                 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2063                 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
2064                 vcpu->arch.emulate_ctxt.mode =
2065                         (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2066                         ? X86EMUL_MODE_REAL : cs_l
2067                         ? X86EMUL_MODE_PROT64 : cs_db
2068                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2069
2070                 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2071                         vcpu->arch.emulate_ctxt.cs_base = 0;
2072                         vcpu->arch.emulate_ctxt.ds_base = 0;
2073                         vcpu->arch.emulate_ctxt.es_base = 0;
2074                         vcpu->arch.emulate_ctxt.ss_base = 0;
2075                 } else {
2076                         vcpu->arch.emulate_ctxt.cs_base =
2077                                         get_segment_base(vcpu, VCPU_SREG_CS);
2078                         vcpu->arch.emulate_ctxt.ds_base =
2079                                         get_segment_base(vcpu, VCPU_SREG_DS);
2080                         vcpu->arch.emulate_ctxt.es_base =
2081                                         get_segment_base(vcpu, VCPU_SREG_ES);
2082                         vcpu->arch.emulate_ctxt.ss_base =
2083                                         get_segment_base(vcpu, VCPU_SREG_SS);
2084                 }
2085
2086                 vcpu->arch.emulate_ctxt.gs_base =
2087                                         get_segment_base(vcpu, VCPU_SREG_GS);
2088                 vcpu->arch.emulate_ctxt.fs_base =
2089                                         get_segment_base(vcpu, VCPU_SREG_FS);
2090
2091                 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2092
2093                 /* Reject the instructions other than VMCALL/VMMCALL when
2094                  * try to emulate invalid opcode */
2095                 c = &vcpu->arch.emulate_ctxt.decode;
2096                 if ((emulation_type & EMULTYPE_TRAP_UD) &&
2097                     (!(c->twobyte && c->b == 0x01 &&
2098                       (c->modrm_reg == 0 || c->modrm_reg == 3) &&
2099                        c->modrm_mod == 3 && c->modrm_rm == 1)))
2100                         return EMULATE_FAIL;
2101
2102                 ++vcpu->stat.insn_emulation;
2103                 if (r)  {
2104                         ++vcpu->stat.insn_emulation_fail;
2105                         if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2106                                 return EMULATE_DONE;
2107                         return EMULATE_FAIL;
2108                 }
2109         }
2110
2111         r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2112
2113         if (vcpu->arch.pio.string)
2114                 return EMULATE_DO_MMIO;
2115
2116         if ((r || vcpu->mmio_is_write) && run) {
2117                 run->exit_reason = KVM_EXIT_MMIO;
2118                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2119                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2120                 run->mmio.len = vcpu->mmio_size;
2121                 run->mmio.is_write = vcpu->mmio_is_write;
2122         }
2123
2124         if (r) {
2125                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2126                         return EMULATE_DONE;
2127                 if (!vcpu->mmio_needed) {
2128                         kvm_report_emulation_failure(vcpu, "mmio");
2129                         return EMULATE_FAIL;
2130                 }
2131                 return EMULATE_DO_MMIO;
2132         }
2133
2134         kvm_x86_ops->decache_regs(vcpu);
2135         kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2136
2137         if (vcpu->mmio_is_write) {
2138                 vcpu->mmio_needed = 0;
2139                 return EMULATE_DO_MMIO;
2140         }
2141
2142         return EMULATE_DONE;
2143 }
2144 EXPORT_SYMBOL_GPL(emulate_instruction);
2145
2146 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
2147 {
2148         int i;
2149
2150         for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
2151                 if (vcpu->arch.pio.guest_pages[i]) {
2152                         kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
2153                         vcpu->arch.pio.guest_pages[i] = NULL;
2154                 }
2155 }
2156
2157 static int pio_copy_data(struct kvm_vcpu *vcpu)
2158 {
2159         void *p = vcpu->arch.pio_data;
2160         void *q;
2161         unsigned bytes;
2162         int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
2163
2164         q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
2165                  PAGE_KERNEL);
2166         if (!q) {
2167                 free_pio_guest_pages(vcpu);
2168                 return -ENOMEM;
2169         }
2170         q += vcpu->arch.pio.guest_page_offset;
2171         bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2172         if (vcpu->arch.pio.in)
2173                 memcpy(q, p, bytes);
2174         else
2175                 memcpy(p, q, bytes);
2176         q -= vcpu->arch.pio.guest_page_offset;
2177         vunmap(q);
2178         free_pio_guest_pages(vcpu);
2179         return 0;
2180 }
2181
2182 int complete_pio(struct kvm_vcpu *vcpu)
2183 {
2184         struct kvm_pio_request *io = &vcpu->arch.pio;
2185         long delta;
2186         int r;
2187
2188         kvm_x86_ops->cache_regs(vcpu);
2189
2190         if (!io->string) {
2191                 if (io->in)
2192                         memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
2193                                io->size);
2194         } else {
2195                 if (io->in) {
2196                         r = pio_copy_data(vcpu);
2197                         if (r) {
2198                                 kvm_x86_ops->cache_regs(vcpu);
2199                                 return r;
2200                         }
2201                 }
2202
2203                 delta = 1;
2204                 if (io->rep) {
2205                         delta *= io->cur_count;
2206                         /*
2207                          * The size of the register should really depend on
2208                          * current address size.
2209                          */
2210                         vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
2211                 }
2212                 if (io->down)
2213                         delta = -delta;
2214                 delta *= io->size;
2215                 if (io->in)
2216                         vcpu->arch.regs[VCPU_REGS_RDI] += delta;
2217                 else
2218                         vcpu->arch.regs[VCPU_REGS_RSI] += delta;
2219         }
2220
2221         kvm_x86_ops->decache_regs(vcpu);
2222
2223         io->count -= io->cur_count;
2224         io->cur_count = 0;
2225
2226         return 0;
2227 }
2228
2229 static void kernel_pio(struct kvm_io_device *pio_dev,
2230                        struct kvm_vcpu *vcpu,
2231                        void *pd)
2232 {
2233         /* TODO: String I/O for in kernel device */
2234
2235         mutex_lock(&vcpu->kvm->lock);
2236         if (vcpu->arch.pio.in)
2237                 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
2238                                   vcpu->arch.pio.size,
2239                                   pd);
2240         else
2241                 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
2242                                    vcpu->arch.pio.size,
2243                                    pd);
2244         mutex_unlock(&vcpu->kvm->lock);
2245 }
2246
2247 static void pio_string_write(struct kvm_io_device *pio_dev,
2248                              struct kvm_vcpu *vcpu)
2249 {
2250         struct kvm_pio_request *io = &vcpu->arch.pio;
2251         void *pd = vcpu->arch.pio_data;
2252         int i;
2253
2254         mutex_lock(&vcpu->kvm->lock);
2255         for (i = 0; i < io->cur_count; i++) {
2256                 kvm_iodevice_write(pio_dev, io->port,
2257                                    io->size,
2258                                    pd);
2259                 pd += io->size;
2260         }
2261         mutex_unlock(&vcpu->kvm->lock);
2262 }
2263
2264 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2265                                                gpa_t addr)
2266 {
2267         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
2268 }
2269
2270 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2271                   int size, unsigned port)
2272 {
2273         struct kvm_io_device *pio_dev;
2274
2275         vcpu->run->exit_reason = KVM_EXIT_IO;
2276         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2277         vcpu->run->io.size = vcpu->arch.pio.size = size;
2278         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2279         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2280         vcpu->run->io.port = vcpu->arch.pio.port = port;
2281         vcpu->arch.pio.in = in;
2282         vcpu->arch.pio.string = 0;
2283         vcpu->arch.pio.down = 0;
2284         vcpu->arch.pio.guest_page_offset = 0;
2285         vcpu->arch.pio.rep = 0;
2286
2287         kvm_x86_ops->cache_regs(vcpu);
2288         memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
2289         kvm_x86_ops->decache_regs(vcpu);
2290
2291         kvm_x86_ops->skip_emulated_instruction(vcpu);
2292
2293         pio_dev = vcpu_find_pio_dev(vcpu, port);
2294         if (pio_dev) {
2295                 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2296                 complete_pio(vcpu);
2297                 return 1;
2298         }
2299         return 0;
2300 }
2301 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2302
2303 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2304                   int size, unsigned long count, int down,
2305                   gva_t address, int rep, unsigned port)
2306 {
2307         unsigned now, in_page;
2308         int i, ret = 0;
2309         int nr_pages = 1;
2310         struct page *page;
2311         struct kvm_io_device *pio_dev;
2312
2313         vcpu->run->exit_reason = KVM_EXIT_IO;
2314         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2315         vcpu->run->io.size = vcpu->arch.pio.size = size;
2316         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2317         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2318         vcpu->run->io.port = vcpu->arch.pio.port = port;
2319         vcpu->arch.pio.in = in;
2320         vcpu->arch.pio.string = 1;
2321         vcpu->arch.pio.down = down;
2322         vcpu->arch.pio.guest_page_offset = offset_in_page(address);
2323         vcpu->arch.pio.rep = rep;
2324
2325         if (!count) {
2326                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2327                 return 1;
2328         }
2329
2330         if (!down)
2331                 in_page = PAGE_SIZE - offset_in_page(address);
2332         else
2333                 in_page = offset_in_page(address) + size;
2334         now = min(count, (unsigned long)in_page / size);
2335         if (!now) {
2336                 /*
2337                  * String I/O straddles page boundary.  Pin two guest pages
2338                  * so that we satisfy atomicity constraints.  Do just one
2339                  * transaction to avoid complexity.
2340                  */
2341                 nr_pages = 2;
2342                 now = 1;
2343         }
2344         if (down) {
2345                 /*
2346                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
2347                  */
2348                 pr_unimpl(vcpu, "guest string pio down\n");
2349                 kvm_inject_gp(vcpu, 0);
2350                 return 1;
2351         }
2352         vcpu->run->io.count = now;
2353         vcpu->arch.pio.cur_count = now;
2354
2355         if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2356                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2357
2358         for (i = 0; i < nr_pages; ++i) {
2359                 down_read(&vcpu->kvm->slots_lock);
2360                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2361                 vcpu->arch.pio.guest_pages[i] = page;
2362                 up_read(&vcpu->kvm->slots_lock);
2363                 if (!page) {
2364                         kvm_inject_gp(vcpu, 0);
2365                         free_pio_guest_pages(vcpu);
2366                         return 1;
2367                 }
2368         }
2369
2370         pio_dev = vcpu_find_pio_dev(vcpu, port);
2371         if (!vcpu->arch.pio.in) {
2372                 /* string PIO write */
2373                 ret = pio_copy_data(vcpu);
2374                 if (ret >= 0 && pio_dev) {
2375                         pio_string_write(pio_dev, vcpu);
2376                         complete_pio(vcpu);
2377                         if (vcpu->arch.pio.count == 0)
2378                                 ret = 1;
2379                 }
2380         } else if (pio_dev)
2381                 pr_unimpl(vcpu, "no string pio read support yet, "
2382                        "port %x size %d count %ld\n",
2383                         port, size, count);
2384
2385         return ret;
2386 }
2387 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2388
2389 int kvm_arch_init(void *opaque)
2390 {
2391         int r;
2392         struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2393
2394         if (kvm_x86_ops) {
2395                 printk(KERN_ERR "kvm: already loaded the other module\n");
2396                 r = -EEXIST;
2397                 goto out;
2398         }
2399
2400         if (!ops->cpu_has_kvm_support()) {
2401                 printk(KERN_ERR "kvm: no hardware support\n");
2402                 r = -EOPNOTSUPP;
2403                 goto out;
2404         }
2405         if (ops->disabled_by_bios()) {
2406                 printk(KERN_ERR "kvm: disabled by bios\n");
2407                 r = -EOPNOTSUPP;
2408                 goto out;
2409         }
2410
2411         r = kvm_mmu_module_init();
2412         if (r)
2413                 goto out;
2414
2415         kvm_init_msr_list();
2416
2417         kvm_x86_ops = ops;
2418         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2419         return 0;
2420
2421 out:
2422         return r;
2423 }
2424
2425 void kvm_arch_exit(void)
2426 {
2427         kvm_x86_ops = NULL;
2428         kvm_mmu_module_exit();
2429 }
2430
2431 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2432 {
2433         ++vcpu->stat.halt_exits;
2434         if (irqchip_in_kernel(vcpu->kvm)) {
2435                 vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
2436                 kvm_vcpu_block(vcpu);
2437                 if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
2438                         return -EINTR;
2439                 return 1;
2440         } else {
2441                 vcpu->run->exit_reason = KVM_EXIT_HLT;
2442                 return 0;
2443         }
2444 }
2445 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2446
2447 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2448 {
2449         unsigned long nr, a0, a1, a2, a3, ret;
2450
2451         kvm_x86_ops->cache_regs(vcpu);
2452
2453         nr = vcpu->arch.regs[VCPU_REGS_RAX];
2454         a0 = vcpu->arch.regs[VCPU_REGS_RBX];
2455         a1 = vcpu->arch.regs[VCPU_REGS_RCX];
2456         a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2457         a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2458
2459         if (!is_long_mode(vcpu)) {
2460                 nr &= 0xFFFFFFFF;
2461                 a0 &= 0xFFFFFFFF;
2462                 a1 &= 0xFFFFFFFF;
2463                 a2 &= 0xFFFFFFFF;
2464                 a3 &= 0xFFFFFFFF;
2465         }
2466
2467         switch (nr) {
2468         case KVM_HC_VAPIC_POLL_IRQ:
2469                 ret = 0;
2470                 break;
2471         default:
2472                 ret = -KVM_ENOSYS;
2473                 break;
2474         }
2475         vcpu->arch.regs[VCPU_REGS_RAX] = ret;
2476         kvm_x86_ops->decache_regs(vcpu);
2477         ++vcpu->stat.hypercalls;
2478         return 0;
2479 }
2480 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2481
2482 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2483 {
2484         char instruction[3];
2485         int ret = 0;
2486
2487
2488         /*
2489          * Blow out the MMU to ensure that no other VCPU has an active mapping
2490          * to ensure that the updated hypercall appears atomically across all
2491          * VCPUs.
2492          */
2493         kvm_mmu_zap_all(vcpu->kvm);
2494
2495         kvm_x86_ops->cache_regs(vcpu);
2496         kvm_x86_ops->patch_hypercall(vcpu, instruction);
2497         if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
2498             != X86EMUL_CONTINUE)
2499                 ret = -EFAULT;
2500
2501         return ret;
2502 }
2503
2504 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2505 {
2506         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2507 }
2508
2509 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2510 {
2511         struct descriptor_table dt = { limit, base };
2512
2513         kvm_x86_ops->set_gdt(vcpu, &dt);
2514 }
2515
2516 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2517 {
2518         struct descriptor_table dt = { limit, base };
2519
2520         kvm_x86_ops->set_idt(vcpu, &dt);
2521 }
2522
2523 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2524                    unsigned long *rflags)
2525 {
2526         kvm_lmsw(vcpu, msw);
2527         *rflags = kvm_x86_ops->get_rflags(vcpu);
2528 }
2529
2530 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2531 {
2532         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2533         switch (cr) {
2534         case 0:
2535                 return vcpu->arch.cr0;
2536         case 2:
2537                 return vcpu->arch.cr2;
2538         case 3:
2539                 return vcpu->arch.cr3;
2540         case 4:
2541                 return vcpu->arch.cr4;
2542         case 8:
2543                 return kvm_get_cr8(vcpu);
2544         default:
2545                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2546                 return 0;
2547         }
2548 }
2549
2550 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2551                      unsigned long *rflags)
2552 {
2553         switch (cr) {
2554         case 0:
2555                 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2556                 *rflags = kvm_x86_ops->get_rflags(vcpu);
2557                 break;
2558         case 2:
2559                 vcpu->arch.cr2 = val;
2560                 break;
2561         case 3:
2562                 kvm_set_cr3(vcpu, val);
2563                 break;
2564         case 4:
2565                 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2566                 break;
2567         case 8:
2568                 kvm_set_cr8(vcpu, val & 0xfUL);
2569                 break;
2570         default:
2571                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2572         }
2573 }
2574
2575 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2576 {
2577         struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2578         int j, nent = vcpu->arch.cpuid_nent;
2579
2580         e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2581         /* when no next entry is found, the current entry[i] is reselected */
2582         for (j = i + 1; j == i; j = (j + 1) % nent) {
2583                 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2584                 if (ej->function == e->function) {
2585                         ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2586                         return j;
2587                 }
2588         }
2589         return 0; /* silence gcc, even though control never reaches here */
2590 }
2591
2592 /* find an entry with matching function, matching index (if needed), and that
2593  * should be read next (if it's stateful) */
2594 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2595         u32 function, u32 index)
2596 {
2597         if (e->function != function)
2598                 return 0;
2599         if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2600                 return 0;
2601         if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2602                 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2603                 return 0;
2604         return 1;
2605 }
2606
2607 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2608 {
2609         int i;
2610         u32 function, index;
2611         struct kvm_cpuid_entry2 *e, *best;
2612
2613         kvm_x86_ops->cache_regs(vcpu);
2614         function = vcpu->arch.regs[VCPU_REGS_RAX];
2615         index = vcpu->arch.regs[VCPU_REGS_RCX];
2616         vcpu->arch.regs[VCPU_REGS_RAX] = 0;
2617         vcpu->arch.regs[VCPU_REGS_RBX] = 0;
2618         vcpu->arch.regs[VCPU_REGS_RCX] = 0;
2619         vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2620         best = NULL;
2621         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2622                 e = &vcpu->arch.cpuid_entries[i];
2623                 if (is_matching_cpuid_entry(e, function, index)) {
2624                         if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
2625                                 move_to_next_stateful_cpuid_entry(vcpu, i);
2626                         best = e;
2627                         break;
2628                 }
2629                 /*
2630                  * Both basic or both extended?
2631                  */
2632                 if (((e->function ^ function) & 0x80000000) == 0)
2633                         if (!best || e->function > best->function)
2634                                 best = e;
2635         }
2636         if (best) {
2637                 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
2638                 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
2639                 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
2640                 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
2641         }
2642         kvm_x86_ops->decache_regs(vcpu);
2643         kvm_x86_ops->skip_emulated_instruction(vcpu);
2644 }
2645 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2646
2647 /*
2648  * Check if userspace requested an interrupt window, and that the
2649  * interrupt window is open.
2650  *
2651  * No need to exit to userspace if we already have an interrupt queued.
2652  */
2653 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
2654                                           struct kvm_run *kvm_run)
2655 {
2656         return (!vcpu->arch.irq_summary &&
2657                 kvm_run->request_interrupt_window &&
2658                 vcpu->arch.interrupt_window_open &&
2659                 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
2660 }
2661
2662 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
2663                               struct kvm_run *kvm_run)
2664 {
2665         kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
2666         kvm_run->cr8 = kvm_get_cr8(vcpu);
2667         kvm_run->apic_base = kvm_get_apic_base(vcpu);
2668         if (irqchip_in_kernel(vcpu->kvm))
2669                 kvm_run->ready_for_interrupt_injection = 1;
2670         else
2671                 kvm_run->ready_for_interrupt_injection =
2672                                         (vcpu->arch.interrupt_window_open &&
2673                                          vcpu->arch.irq_summary == 0);
2674 }
2675
2676 static void vapic_enter(struct kvm_vcpu *vcpu)
2677 {
2678         struct kvm_lapic *apic = vcpu->arch.apic;
2679         struct page *page;
2680
2681         if (!apic || !apic->vapic_addr)
2682                 return;
2683
2684         down_read(&current->mm->mmap_sem);
2685         page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2686         up_read(&current->mm->mmap_sem);
2687
2688         vcpu->arch.apic->vapic_page = page;
2689 }
2690
2691 static void vapic_exit(struct kvm_vcpu *vcpu)
2692 {
2693         struct kvm_lapic *apic = vcpu->arch.apic;
2694
2695         if (!apic || !apic->vapic_addr)
2696                 return;
2697
2698         kvm_release_page_dirty(apic->vapic_page);
2699         mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2700 }
2701
/*
 * Inner guest execution loop for one vcpu.
 *
 * Repeatedly prepares the vcpu, enters the guest through kvm_x86_ops->run()
 * and dispatches the exit through kvm_x86_ops->handle_exit().  Throughout
 * the function a positive r means "keep going" and r <= 0 means "return to
 * the caller":
 *  - `again`     restarts the loop directly after a handled exit;
 *  - `preempted` restarts it after kvm_resched() was called at `out`.
 *
 * Returns the first non-positive r (0, -EINTR, or whatever error
 * kvm_mmu_reload()/vcpu_reset()/handle_exit() produced).  Before returning
 * through `out`, kvm_run is refreshed by post_kvm_run_save() and the vapic
 * page taken by vapic_enter() is dropped by vapic_exit().
 */
2702 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2703 {
2704         int r;
2705
             /* A received SIPI resets the lapic and the vcpu, then makes it runnable. */
2706         if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
2707                 pr_debug("vcpu %d received sipi with vector # %x\n",
2708                        vcpu->vcpu_id, vcpu->arch.sipi_vector);
2709                 kvm_lapic_reset(vcpu);
2710                 r = kvm_x86_ops->vcpu_reset(vcpu);
2711                 if (r)
2712                         return r;
2713                 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
2714         }
2715
2716         vapic_enter(vcpu);
2717
2718 preempted:
2719         if (vcpu->guest_debug.enabled)
2720                 kvm_x86_ops->guest_debug_pre(vcpu);
2721
2722 again:
             /* A pending MMU reload request is served by unloading; the reload follows. */
2723         if (vcpu->requests)
2724                 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
2725                         kvm_mmu_unload(vcpu);
2726
2727         r = kvm_mmu_reload(vcpu);
2728         if (unlikely(r))
2729                 goto out;
2730
             /*
              * Remaining requests.  TPR-access reports and triple faults end
              * the run with r = 0 and the matching exit_reason.
              */
2731         if (vcpu->requests) {
2732                 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2733                         __kvm_migrate_apic_timer(vcpu);
2734                 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2735                                        &vcpu->requests)) {
2736                         kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
2737                         r = 0;
2738                         goto out;
2739                 }
2740                 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
2741                         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2742                         r = 0;
2743                         goto out;
2744                 }
2745         }
2746
2747         kvm_inject_pending_timer_irqs(vcpu);
2748
             /*
              * From here until after the guest has run, preemption and then
              * local interrupts are disabled.  Every bail-out below re-enables
              * both before jumping to `out`.
              */
2749         preempt_disable();
2750
2751         kvm_x86_ops->prepare_guest_switch(vcpu);
2752         kvm_load_guest_fpu(vcpu);
2753
2754         local_irq_disable();
2755
             /* r = 1 makes the `out` path call kvm_resched() and retry. */
2756         if (need_resched()) {
2757                 local_irq_enable();
2758                 preempt_enable();
2759                 r = 1;
2760                 goto out;
2761         }
2762
             /*
              * An MMU reload requested since the check at `again` is only
              * tested here (not cleared); the retry will handle it.
              */
2763         if (vcpu->requests)
2764                 if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
2765                         local_irq_enable();
2766                         preempt_enable();
2767                         r = 1;
2768                         goto out;
2769                 }
2770
2771         if (signal_pending(current)) {
2772                 local_irq_enable();
2773                 preempt_enable();
2774                 r = -EINTR;
2775                 kvm_run->exit_reason = KVM_EXIT_INTR;
2776                 ++vcpu->stat.signal_exits;
2777                 goto out;
2778         }
2779
             /*
              * Event injection: a pending exception takes precedence over
              * interrupts; interrupts come from the in-kernel irqchip or from
              * the userspace-provided vectors.
              */
2780         if (vcpu->arch.exception.pending)
2781                 __queue_exception(vcpu);
2782         else if (irqchip_in_kernel(vcpu->kvm))
2783                 kvm_x86_ops->inject_pending_irq(vcpu);
2784         else
2785                 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2786
2787         kvm_lapic_sync_to_vapic(vcpu);
2788
             /* guest_mode is what kvm_vcpu_kick() tests before sending an IPI. */
2789         vcpu->guest_mode = 1;
2790         kvm_guest_enter();
2791
2792         if (vcpu->requests)
2793                 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2794                         kvm_x86_ops->tlb_flush(vcpu);
2795
2796         kvm_x86_ops->run(vcpu, kvm_run);
2797
2798         vcpu->guest_mode = 0;
2799         local_irq_enable();
2800
2801         ++vcpu->stat.exits;
2802
2803         /*
2804          * We must have an instruction between local_irq_enable() and
2805          * kvm_guest_exit(), so the timer interrupt isn't delayed by
2806          * the interrupt shadow.  The stat.exits increment will do nicely.
2807          * But we need to prevent reordering, hence this barrier():
2808          */
2809         barrier();
2810
2811         kvm_guest_exit();
2812
2813         preempt_enable();
2814
2815         /*
2816          * Profile KVM exit RIPs:
2817          */
2818         if (unlikely(prof_on == KVM_PROFILING)) {
2819                 kvm_x86_ops->cache_regs(vcpu);
2820                 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
2821         }
2822
             /* Drop the pending exception once the vendor code reports it injected. */
2823         if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
2824                 vcpu->arch.exception.pending = false;
2825
2826         kvm_lapic_sync_from_vapic(vcpu);
2827
2828         r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2829
             /*
              * r > 0: the exit was handled and the guest can be resumed,
              * unless userspace is waiting for an open interrupt window
              * (-EINTR) or a reschedule is due (handled at `out`).
              */
2830         if (r > 0) {
2831                 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2832                         r = -EINTR;
2833                         kvm_run->exit_reason = KVM_EXIT_INTR;
2834                         ++vcpu->stat.request_irq_exits;
2835                         goto out;
2836                 }
2837                 if (!need_resched())
2838                         goto again;
2839         }
2840
2841 out:
             /* Positive r: give up the CPU if needed, then start over. */
2842         if (r > 0) {
2843                 kvm_resched(vcpu);
2844                 goto preempted;
2845         }
2846
2847         post_kvm_run_save(vcpu, kvm_run);
2848
2849         vapic_exit(vcpu);
2850
2851         return r;
2852 }
2853
2854 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2855 {
2856         int r;
2857         sigset_t sigsaved;
2858
2859         vcpu_load(vcpu);
2860
2861         if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2862                 kvm_vcpu_block(vcpu);
2863                 vcpu_put(vcpu);
2864                 return -EAGAIN;
2865         }
2866
2867         if (vcpu->sigset_active)
2868                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2869
2870         /* re-sync apic's tpr */
2871         if (!irqchip_in_kernel(vcpu->kvm))
2872                 kvm_set_cr8(vcpu, kvm_run->cr8);
2873
2874         if (vcpu->arch.pio.cur_count) {
2875                 r = complete_pio(vcpu);
2876                 if (r)
2877                         goto out;
2878         }
2879 #if CONFIG_HAS_IOMEM
2880         if (vcpu->mmio_needed) {
2881                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2882                 vcpu->mmio_read_completed = 1;
2883                 vcpu->mmio_needed = 0;
2884                 r = emulate_instruction(vcpu, kvm_run,
2885                                         vcpu->arch.mmio_fault_cr2, 0,
2886                                         EMULTYPE_NO_DECODE);
2887                 if (r == EMULATE_DO_MMIO) {
2888                         /*
2889                          * Read-modify-write.  Back to userspace.
2890                          */
2891                         r = 0;
2892                         goto out;
2893                 }
2894         }
2895 #endif
2896         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2897                 kvm_x86_ops->cache_regs(vcpu);
2898                 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2899                 kvm_x86_ops->decache_regs(vcpu);
2900         }
2901
2902         r = __vcpu_run(vcpu, kvm_run);
2903
2904 out:
2905         if (vcpu->sigset_active)
2906                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
2907
2908         vcpu_put(vcpu);
2909         return r;
2910 }
2911
2912 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2913 {
2914         vcpu_load(vcpu);
2915
2916         kvm_x86_ops->cache_regs(vcpu);
2917
2918         regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
2919         regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
2920         regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
2921         regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
2922         regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
2923         regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
2924         regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2925         regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
2926 #ifdef CONFIG_X86_64
2927         regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
2928         regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
2929         regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
2930         regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
2931         regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
2932         regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
2933         regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
2934         regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
2935 #endif
2936
2937         regs->rip = vcpu->arch.rip;
2938         regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2939
2940         /*
2941          * Don't leak debug flags in case they were set for guest debugging
2942          */
2943         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2944                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2945
2946         vcpu_put(vcpu);
2947
2948         return 0;
2949 }
2950
2951 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2952 {
2953         vcpu_load(vcpu);
2954
2955         vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
2956         vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
2957         vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
2958         vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
2959         vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
2960         vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
2961         vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
2962         vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
2963 #ifdef CONFIG_X86_64
2964         vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
2965         vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
2966         vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
2967         vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
2968         vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
2969         vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
2970         vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
2971         vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
2972 #endif
2973
2974         vcpu->arch.rip = regs->rip;
2975         kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2976
2977         kvm_x86_ops->decache_regs(vcpu);
2978
2979         vcpu_put(vcpu);
2980
2981         return 0;
2982 }
2983
2984 static void get_segment(struct kvm_vcpu *vcpu,
2985                         struct kvm_segment *var, int seg)
2986 {
2987         kvm_x86_ops->get_segment(vcpu, var, seg);
2988 }
2989
2990 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2991 {
2992         struct kvm_segment cs;
2993
2994         get_segment(vcpu, &cs, VCPU_SREG_CS);
2995         *db = cs.db;
2996         *l = cs.l;
2997 }
2998 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2999
3000 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3001                                   struct kvm_sregs *sregs)
3002 {
3003         struct descriptor_table dt;
3004         int pending_vec;
3005
3006         vcpu_load(vcpu);
3007
3008         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3009         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3010         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3011         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3012         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3013         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3014
3015         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3016         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3017
3018         kvm_x86_ops->get_idt(vcpu, &dt);
3019         sregs->idt.limit = dt.limit;
3020         sregs->idt.base = dt.base;
3021         kvm_x86_ops->get_gdt(vcpu, &dt);
3022         sregs->gdt.limit = dt.limit;
3023         sregs->gdt.base = dt.base;
3024
3025         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3026         sregs->cr0 = vcpu->arch.cr0;
3027         sregs->cr2 = vcpu->arch.cr2;
3028         sregs->cr3 = vcpu->arch.cr3;
3029         sregs->cr4 = vcpu->arch.cr4;
3030         sregs->cr8 = kvm_get_cr8(vcpu);
3031         sregs->efer = vcpu->arch.shadow_efer;
3032         sregs->apic_base = kvm_get_apic_base(vcpu);
3033
3034         if (irqchip_in_kernel(vcpu->kvm)) {
3035                 memset(sregs->interrupt_bitmap, 0,
3036                        sizeof sregs->interrupt_bitmap);
3037                 pending_vec = kvm_x86_ops->get_irq(vcpu);
3038                 if (pending_vec >= 0)
3039                         set_bit(pending_vec,
3040                                 (unsigned long *)sregs->interrupt_bitmap);
3041         } else
3042                 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
3043                        sizeof sregs->interrupt_bitmap);
3044
3045         vcpu_put(vcpu);
3046
3047         return 0;
3048 }
3049
3050 static void set_segment(struct kvm_vcpu *vcpu,
3051                         struct kvm_segment *var, int seg)
3052 {
3053         kvm_x86_ops->set_segment(vcpu, var, seg);
3054 }
3055
3056 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3057                                   struct kvm_sregs *sregs)
3058 {
3059         int mmu_reset_needed = 0;
3060         int i, pending_vec, max_bits;
3061         struct descriptor_table dt;
3062
3063         vcpu_load(vcpu);
3064
3065         dt.limit = sregs->idt.limit;
3066         dt.base = sregs->idt.base;
3067         kvm_x86_ops->set_idt(vcpu, &dt);
3068         dt.limit = sregs->gdt.limit;
3069         dt.base = sregs->gdt.base;
3070         kvm_x86_ops->set_gdt(vcpu, &dt);
3071
3072         vcpu->arch.cr2 = sregs->cr2;
3073         mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
3074         vcpu->arch.cr3 = sregs->cr3;
3075
3076         kvm_set_cr8(vcpu, sregs->cr8);
3077
3078         mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
3079         kvm_x86_ops->set_efer(vcpu, sregs->efer);
3080         kvm_set_apic_base(vcpu, sregs->apic_base);
3081
3082         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3083
3084         mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
3085         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
3086         vcpu->arch.cr0 = sregs->cr0;
3087
3088         mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
3089         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
3090         if (!is_long_mode(vcpu) && is_pae(vcpu))
3091                 load_pdptrs(vcpu, vcpu->arch.cr3);
3092
3093         if (mmu_reset_needed)
3094                 kvm_mmu_reset_context(vcpu);
3095
3096         if (!irqchip_in_kernel(vcpu->kvm)) {
3097                 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
3098                        sizeof vcpu->arch.irq_pending);
3099                 vcpu->arch.irq_summary = 0;
3100                 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
3101                         if (vcpu->arch.irq_pending[i])
3102                                 __set_bit(i, &vcpu->arch.irq_summary);
3103         } else {
3104                 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
3105                 pending_vec = find_first_bit(
3106                         (const unsigned long *)sregs->interrupt_bitmap,
3107                         max_bits);
3108                 /* Only pending external irq is handled here */
3109                 if (pending_vec < max_bits) {
3110                         kvm_x86_ops->set_irq(vcpu, pending_vec);
3111                         pr_debug("Set back pending irq %d\n",
3112                                  pending_vec);
3113                 }
3114         }
3115
3116         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3117         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3118         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3119         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3120         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3121         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3122
3123         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3124         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3125
3126         vcpu_put(vcpu);
3127
3128         return 0;
3129 }
3130
3131 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
3132                                     struct kvm_debug_guest *dbg)
3133 {
3134         int r;
3135
3136         vcpu_load(vcpu);
3137
3138         r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
3139
3140         vcpu_put(vcpu);
3141
3142         return r;
3143 }
3144
3145 /*
3146  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
3147  * we have asm/x86/processor.h
3148  */
/*
 * In this file the layout is used by kvm_arch_vcpu_ioctl_get_fpu() and
 * kvm_arch_vcpu_ioctl_set_fpu(), which view vcpu->arch.guest_fx_image
 * through it.  Field order and sizes must therefore not be changed.
 */
3149 struct fxsave {
             /* copied to/from kvm_fpu.fcw */
3150         u16     cwd;
             /* copied to/from kvm_fpu.fsw */
3151         u16     swd;
             /* copied to/from kvm_fpu.ftwx */
3152         u16     twd;
             /* copied to/from kvm_fpu.last_opcode */
3153         u16     fop;
             /* copied to/from kvm_fpu.last_ip */
3154         u64     rip;
             /* copied to/from kvm_fpu.last_dp */
3155         u64     rdp;
             /* not transferred by the get_fpu/set_fpu ioctls in this file */
3156         u32     mxcsr;
3157         u32     mxcsr_mask;
3158         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
3159 #ifdef CONFIG_X86_64
3160         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
3161 #else
3162         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
3163 #endif
3164 };
3165
3166 /*
3167  * Translate a guest virtual address to a guest physical address.
3168  */
3169 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
3170                                     struct kvm_translation *tr)
3171 {
3172         unsigned long vaddr = tr->linear_address;
3173         gpa_t gpa;
3174
3175         vcpu_load(vcpu);
3176         down_read(&vcpu->kvm->slots_lock);
3177         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
3178         up_read(&vcpu->kvm->slots_lock);
3179         tr->physical_address = gpa;
3180         tr->valid = gpa != UNMAPPED_GVA;
3181         tr->writeable = 1;
3182         tr->usermode = 0;
3183         vcpu_put(vcpu);
3184
3185         return 0;
3186 }
3187
3188 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
3189 {
3190         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
3191
3192         vcpu_load(vcpu);
3193
3194         memcpy(fpu->fpr, fxsave->st_space, 128);
3195         fpu->fcw = fxsave->cwd;
3196         fpu->fsw = fxsave->swd;
3197         fpu->ftwx = fxsave->twd;
3198         fpu->last_opcode = fxsave->fop;
3199         fpu->last_ip = fxsave->rip;
3200         fpu->last_dp = fxsave->rdp;
3201         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
3202
3203         vcpu_put(vcpu);
3204
3205         return 0;
3206 }
3207
3208 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
3209 {
3210         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
3211
3212         vcpu_load(vcpu);
3213
3214         memcpy(fxsave->st_space, fpu->fpr, 128);
3215         fxsave->cwd = fpu->fcw;
3216         fxsave->swd = fpu->fsw;
3217         fxsave->twd = fpu->ftwx;
3218         fxsave->fop = fpu->last_opcode;
3219         fxsave->rip = fpu->last_ip;
3220         fxsave->rdp = fpu->last_dp;
3221         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
3222
3223         vcpu_put(vcpu);
3224
3225         return 0;
3226 }
3227
3228 void fx_init(struct kvm_vcpu *vcpu)
3229 {
3230         unsigned after_mxcsr_mask;
3231
3232         /* Initialize guest FPU by resetting ours and saving into guest's */
3233         preempt_disable();
3234         fx_save(&vcpu->arch.host_fx_image);
3235         fpu_init();
3236         fx_save(&vcpu->arch.guest_fx_image);
3237         fx_restore(&vcpu->arch.host_fx_image);
3238         preempt_enable();
3239
3240         vcpu->arch.cr0 |= X86_CR0_ET;
3241         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
3242         vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
3243         memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
3244                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
3245 }
3246 EXPORT_SYMBOL_GPL(fx_init);
3247
3248 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
3249 {
3250         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
3251                 return;
3252
3253         vcpu->guest_fpu_loaded = 1;
3254         fx_save(&vcpu->arch.host_fx_image);
3255         fx_restore(&vcpu->arch.guest_fx_image);
3256 }
3257 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
3258
3259 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
3260 {
3261         if (!vcpu->guest_fpu_loaded)
3262                 return;
3263
3264         vcpu->guest_fpu_loaded = 0;
3265         fx_save(&vcpu->arch.guest_fx_image);
3266         fx_restore(&vcpu->arch.host_fx_image);
3267         ++vcpu->stat.fpu_reload;
3268 }
3269 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
3270
3271 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
3272 {
3273         kvm_x86_ops->vcpu_free(vcpu);
3274 }
3275
3276 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
3277                                                 unsigned int id)
3278 {
3279         return kvm_x86_ops->vcpu_create(kvm, id);
3280 }
3281
3282 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
3283 {
3284         int r;
3285
3286         /* We do fxsave: this must be aligned. */
3287         BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
3288
3289         vcpu_load(vcpu);
3290         r = kvm_arch_vcpu_reset(vcpu);
3291         if (r == 0)
3292                 r = kvm_mmu_setup(vcpu);
3293         vcpu_put(vcpu);
3294         if (r < 0)
3295                 goto free_vcpu;
3296
3297         return 0;
3298 free_vcpu:
3299         kvm_x86_ops->vcpu_free(vcpu);
3300         return r;
3301 }
3302
3303 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
3304 {
3305         vcpu_load(vcpu);
3306         kvm_mmu_unload(vcpu);
3307         vcpu_put(vcpu);
3308
3309         kvm_x86_ops->vcpu_free(vcpu);
3310 }
3311
3312 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
3313 {
3314         return kvm_x86_ops->vcpu_reset(vcpu);
3315 }
3316
3317 void kvm_arch_hardware_enable(void *garbage)
3318 {
3319         kvm_x86_ops->hardware_enable(garbage);
3320 }
3321
3322 void kvm_arch_hardware_disable(void *garbage)
3323 {
3324         kvm_x86_ops->hardware_disable(garbage);
3325 }
3326
3327 int kvm_arch_hardware_setup(void)
3328 {
3329         return kvm_x86_ops->hardware_setup();
3330 }
3331
3332 void kvm_arch_hardware_unsetup(void)
3333 {
3334         kvm_x86_ops->hardware_unsetup();
3335 }
3336
3337 void kvm_arch_check_processor_compat(void *rtn)
3338 {
3339         kvm_x86_ops->check_processor_compatibility(rtn);
3340 }
3341
3342 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
3343 {
3344         struct page *page;
3345         struct kvm *kvm;
3346         int r;
3347
3348         BUG_ON(vcpu->kvm == NULL);
3349         kvm = vcpu->kvm;
3350
3351         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
3352         if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
3353                 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
3354         else
3355                 vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
3356
3357         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3358         if (!page) {
3359                 r = -ENOMEM;
3360                 goto fail;
3361         }
3362         vcpu->arch.pio_data = page_address(page);
3363
3364         r = kvm_mmu_create(vcpu);
3365         if (r < 0)
3366                 goto fail_free_pio_data;
3367
3368         if (irqchip_in_kernel(kvm)) {
3369                 r = kvm_create_lapic(vcpu);
3370                 if (r < 0)
3371                         goto fail_mmu_destroy;
3372         }
3373
3374         return 0;
3375
3376 fail_mmu_destroy:
3377         kvm_mmu_destroy(vcpu);
3378 fail_free_pio_data:
3379         free_page((unsigned long)vcpu->arch.pio_data);
3380 fail:
3381         return r;
3382 }
3383
3384 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
3385 {
3386         kvm_free_lapic(vcpu);
3387         kvm_mmu_destroy(vcpu);
3388         free_page((unsigned long)vcpu->arch.pio_data);
3389 }
3390
3391 struct  kvm *kvm_arch_create_vm(void)
3392 {
3393         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
3394
3395         if (!kvm)
3396                 return ERR_PTR(-ENOMEM);
3397
3398         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
3399
3400         return kvm;
3401 }
3402
/* Unload the MMU of @vcpu, loading the vcpu around the call. */
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
        vcpu_load(vcpu);

        kvm_mmu_unload(vcpu);

        vcpu_put(vcpu);
}
3409
3410 static void kvm_free_vcpus(struct kvm *kvm)
3411 {
3412         unsigned int i;
3413
3414         /*
3415          * Unpin any mmu pages first.
3416          */
3417         for (i = 0; i < KVM_MAX_VCPUS; ++i)
3418                 if (kvm->vcpus[i])
3419                         kvm_unload_vcpu_mmu(kvm->vcpus[i]);
3420         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3421                 if (kvm->vcpus[i]) {
3422                         kvm_arch_vcpu_free(kvm->vcpus[i]);
3423                         kvm->vcpus[i] = NULL;
3424                 }
3425         }
3426
3427 }
3428
3429 void kvm_arch_destroy_vm(struct kvm *kvm)
3430 {
3431         kvm_free_pit(kvm);
3432         kfree(kvm->arch.vpic);
3433         kfree(kvm->arch.vioapic);
3434         kvm_free_vcpus(kvm);
3435         kvm_free_physmem(kvm);
3436         kfree(kvm);
3437 }
3438
3439 int kvm_arch_set_memory_region(struct kvm *kvm,
3440                                 struct kvm_userspace_memory_region *mem,
3441                                 struct kvm_memory_slot old,
3442                                 int user_alloc)
3443 {
3444         int npages = mem->memory_size >> PAGE_SHIFT;
3445         struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
3446
3447         /*To keep backward compatibility with older userspace,
3448          *x86 needs to hanlde !user_alloc case.
3449          */
3450         if (!user_alloc) {
3451                 if (npages && !old.rmap) {
3452                         down_write(&current->mm->mmap_sem);
3453                         memslot->userspace_addr = do_mmap(NULL, 0,
3454                                                      npages * PAGE_SIZE,
3455                                                      PROT_READ | PROT_WRITE,
3456                                                      MAP_SHARED | MAP_ANONYMOUS,
3457                                                      0);
3458                         up_write(&current->mm->mmap_sem);
3459
3460                         if (IS_ERR((void *)memslot->userspace_addr))
3461                                 return PTR_ERR((void *)memslot->userspace_addr);
3462                 } else {
3463                         if (!old.user_alloc && old.rmap) {
3464                                 int ret;
3465
3466                                 down_write(&current->mm->mmap_sem);
3467                                 ret = do_munmap(current->mm, old.userspace_addr,
3468                                                 old.npages * PAGE_SIZE);
3469                                 up_write(&current->mm->mmap_sem);
3470                                 if (ret < 0)
3471                                         printk(KERN_WARNING
3472                                        "kvm_vm_ioctl_set_memory_region: "
3473                                        "failed to munmap memory\n");
3474                         }
3475                 }
3476         }
3477
3478         if (!kvm->arch.n_requested_mmu_pages) {
3479                 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
3480                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
3481         }
3482
3483         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
3484         kvm_flush_remote_tlbs(kvm);
3485
3486         return 0;
3487 }
3488
3489 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
3490 {
3491         return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
3492                || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
3493 }
3494
/*
 * Cross-CPU callback used by kvm_vcpu_kick().  It has no work of its own;
 * with DEBUG defined it logs the vcpu pointer passed in @info.
 */
static void vcpu_kick_intr(void *info)
{
#ifdef DEBUG
        struct kvm_vcpu *kicked = info;

        printk(KERN_DEBUG "vcpu_kick_intr %p \n", kicked);
#endif
}
3502
3503 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3504 {
3505         int ipi_pcpu = vcpu->cpu;
3506
3507         if (waitqueue_active(&vcpu->wq)) {
3508                 wake_up_interruptible(&vcpu->wq);
3509                 ++vcpu->stat.halt_wakeup;
3510         }
3511         if (vcpu->guest_mode)
3512                 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
3513 }