arch/x86/kvm/x86.c

   1 /*
   2  * Kernel-based Virtual Machine driver for Linux
   3  *
   4  * derived from drivers/kvm/kvm_main.c
   5  *
   6  * Copyright (C) 2006 Qumranet, Inc.
   7  * Copyright (C) 2008 Qumranet, Inc.
   8  * Copyright IBM Corporation, 2008
   9  *
  10  * Authors:
  11  *   Avi Kivity   <avi@qumranet.com>
  12  *   Yaniv Kamay  <yaniv@qumranet.com>
  13  *   Amit Shah    <amit.shah@qumranet.com>
  14  *   Ben-Ami Yassour <benami@il.ibm.com>
  15  *
  16  * This work is licensed under the terms of the GNU GPL, version 2.  See
  17  * the COPYING file in the top-level directory.
  18  *
  19  */
  20
  21 #include <linux/kvm_host.h>
  22 #include "irq.h"
  23 #include "mmu.h"
  24 #include "i8254.h"
  25 #include "tss.h"
  26 #include "kvm_cache_regs.h"
  27 #include "x86.h"
  28
  29 #include <linux/clocksource.h>
  30 #include <linux/interrupt.h>
  31 #include <linux/kvm.h>
  32 #include <linux/fs.h>
  33 #include <linux/vmalloc.h>
  34 #include <linux/module.h>
  35 #include <linux/mman.h>
  36 #include <linux/highmem.h>
  37 #include <linux/iommu.h>
  38 #include <linux/intel-iommu.h>
  39 #include <linux/cpufreq.h>
  40
  41 #include <asm/uaccess.h>
  42 #include <asm/msr.h>
  43 #include <asm/desc.h>
  44 #include <asm/mtrr.h>
  45
  46 #define MAX_IO_MSRS 256
  47 #define CR0_RESERVED_BITS                                               \
  48         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
  49                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
  50                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
  51 #define CR4_RESERVED_BITS                                               \
  52         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
  53                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
  54                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
  55                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
  56
  57 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
  58 /* EFER defaults:
  59  * - enable syscall per default because its emulated by KVM
  60  * - enable LME and LMA per default on 64 bit KVM
  61  */
  62 #ifdef CONFIG_X86_64
  63 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
  64 #else
  65 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
  66 #endif
  67
  68 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
  69 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  70
  71 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
  72                                     struct kvm_cpuid_entry2 __user *entries);
  73 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
  74                                               u32 function, u32 index);
  75
  76 struct kvm_x86_ops *kvm_x86_ops;
  77 EXPORT_SYMBOL_GPL(kvm_x86_ops);
  78
  79 struct kvm_stats_debugfs_item debugfs_entries[] = {
  80         { "pf_fixed", VCPU_STAT(pf_fixed) },
  81         { "pf_guest", VCPU_STAT(pf_guest) },
  82         { "tlb_flush", VCPU_STAT(tlb_flush) },
  83         { "invlpg", VCPU_STAT(invlpg) },
  84         { "exits", VCPU_STAT(exits) },
  85         { "io_exits", VCPU_STAT(io_exits) },
  86         { "mmio_exits", VCPU_STAT(mmio_exits) },
  87         { "signal_exits", VCPU_STAT(signal_exits) },
  88         { "irq_window", VCPU_STAT(irq_window_exits) },
  89         { "nmi_window", VCPU_STAT(nmi_window_exits) },
  90         { "halt_exits", VCPU_STAT(halt_exits) },
  91         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
  92         { "hypercalls", VCPU_STAT(hypercalls) },
  93         { "request_irq", VCPU_STAT(request_irq_exits) },
  94         { "request_nmi", VCPU_STAT(request_nmi_exits) },
  95         { "irq_exits", VCPU_STAT(irq_exits) },
  96         { "host_state_reload", VCPU_STAT(host_state_reload) },
  97         { "efer_reload", VCPU_STAT(efer_reload) },
  98         { "fpu_reload", VCPU_STAT(fpu_reload) },
  99         { "insn_emulation", VCPU_STAT(insn_emulation) },
 100         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
 101         { "irq_injections", VCPU_STAT(irq_injections) },
 102         { "nmi_injections", VCPU_STAT(nmi_injections) },
 103         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 104         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
 105         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
 106         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
 107         { "mmu_flooded", VM_STAT(mmu_flooded) },
 108         { "mmu_recycled", VM_STAT(mmu_recycled) },
 109         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 110         { "mmu_unsync", VM_STAT(mmu_unsync) },
 111         { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
 112         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 113         { "largepages", VM_STAT(lpages) },
 114         { NULL }
 115 };
 116
 117 unsigned long segment_base(u16 selector)
 118 {
 119         struct descriptor_table gdt;
 120         struct desc_struct *d;
 121         unsigned long table_base;
 122         unsigned long v;
 123
 124         if (selector == 0)
 125                 return 0;
 126
 127         asm("sgdt %0" : "=m"(gdt));
 128         table_base = gdt.base;
 129
 130         if (selector & 4) {           /* from ldt */
 131                 u16 ldt_selector;
 132
 133                 asm("sldt %0" : "=g"(ldt_selector));
 134                 table_base = segment_base(ldt_selector);
 135         }
 136         d = (struct desc_struct *)(table_base + (selector & ~7));
 137         v = d->base0 | ((unsigned long)d->base1 << 16) |
 138                 ((unsigned long)d->base2 << 24);
 139 #ifdef CONFIG_X86_64
 140         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
 141                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
 142 #endif
 143         return v;
 144 }
 145 EXPORT_SYMBOL_GPL(segment_base);
 146
 147 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 148 {
 149         if (irqchip_in_kernel(vcpu->kvm))
 150                 return vcpu->arch.apic_base;
 151         else
 152                 return vcpu->arch.apic_base;
 153 }
 154 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 155
 156 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 157 {
 158         /* TODO: reserve bits check */
 159         if (irqchip_in_kernel(vcpu->kvm))
 160                 kvm_lapic_set_base(vcpu, data);
 161         else
 162                 vcpu->arch.apic_base = data;
 163 }
 164 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 165
 166 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 167 {
 168         WARN_ON(vcpu->arch.exception.pending);
 169         vcpu->arch.exception.pending = true;
 170         vcpu->arch.exception.has_error_code = false;
 171         vcpu->arch.exception.nr = nr;
 172 }
 173 EXPORT_SYMBOL_GPL(kvm_queue_exception);
 174
 175 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 176                            u32 error_code)
 177 {
 178         ++vcpu->stat.pf_guest;
 179
 180         if (vcpu->arch.exception.pending) {
 181                 if (vcpu->arch.exception.nr == PF_VECTOR) {
 182                         printk(KERN_DEBUG "kvm: inject_page_fault:"
 183                                         " double fault 0x%lx\n", addr);
 184                         vcpu->arch.exception.nr = DF_VECTOR;
 185                         vcpu->arch.exception.error_code = 0;
 186                 } else if (vcpu->arch.exception.nr == DF_VECTOR) {
 187                         /* triple fault -> shutdown */
 188                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
 189                 }
 190                 return;
 191         }
 192         vcpu->arch.cr2 = addr;
 193         kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 194 }
 195
 196 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 197 {
 198         vcpu->arch.nmi_pending = 1;
 199 }
 200 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 201
 202 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 203 {
 204         WARN_ON(vcpu->arch.exception.pending);
 205         vcpu->arch.exception.pending = true;
 206         vcpu->arch.exception.has_error_code = true;
 207         vcpu->arch.exception.nr = nr;
 208         vcpu->arch.exception.error_code = error_code;
 209 }
 210 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 211
 212 static void __queue_exception(struct kvm_vcpu *vcpu)
 213 {
 214         kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
 215                                      vcpu->arch.exception.has_error_code,
 216                                      vcpu->arch.exception.error_code);
 217 }
 218
 219 /*
 220  * Load the pae pdptrs.  Return true is they are all valid.
 221  */
 222 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 223 {
 224         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 225         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 226         int i;
 227         int ret;
 228         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 229
 230         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
 231                                   offset * sizeof(u64), sizeof(pdpte));
 232         if (ret < 0) {
 233                 ret = 0;
 234                 goto out;
 235         }
 236         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 237                 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
 238                         ret = 0;
 239                         goto out;
 240                 }
 241         }
 242         ret = 1;
 243
 244         memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 245 out:
 246
 247         return ret;
 248 }
 249 EXPORT_SYMBOL_GPL(load_pdptrs);
 250
 251 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 252 {
 253         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 254         bool changed = true;
 255         int r;
 256
 257         if (is_long_mode(vcpu) || !is_pae(vcpu))
 258                 return false;
 259
 260         r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
 261         if (r < 0)
 262                 goto out;
 263         changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 264 out:
 265
 266         return changed;
 267 }
 268
 269 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 270 {
 271         if (cr0 & CR0_RESERVED_BITS) {
 272                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
 273                        cr0, vcpu->arch.cr0);
 274                 kvm_inject_gp(vcpu, 0);
 275                 return;
 276         }
 277
 278         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
 279                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
 280                 kvm_inject_gp(vcpu, 0);
 281                 return;
 282         }
 283
 284         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
 285                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
 286                        "and a clear PE flag\n");
 287                 kvm_inject_gp(vcpu, 0);
 288                 return;
 289         }
 290
 291         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 292 #ifdef CONFIG_X86_64
 293                 if ((vcpu->arch.shadow_efer & EFER_LME)) {
 294                         int cs_db, cs_l;
 295
 296                         if (!is_pae(vcpu)) {
 297                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
 298                                        "in long mode while PAE is disabled\n");
 299                                 kvm_inject_gp(vcpu, 0);
 300                                 return;
 301                         }
 302                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 303                         if (cs_l) {
 304                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
 305                                        "in long mode while CS.L == 1\n");
 306                                 kvm_inject_gp(vcpu, 0);
 307                                 return;
 308
 309                         }
 310                 } else
 311 #endif
 312                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 313                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
 314                                "reserved bits\n");
 315                         kvm_inject_gp(vcpu, 0);
 316                         return;
 317                 }
 318
 319         }
 320
 321         kvm_x86_ops->set_cr0(vcpu, cr0);
 322         vcpu->arch.cr0 = cr0;
 323
 324         kvm_mmu_sync_global(vcpu);
 325         kvm_mmu_reset_context(vcpu);
 326         return;
 327 }
 328 EXPORT_SYMBOL_GPL(kvm_set_cr0);
 329
 330 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 331 {
 332         kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
 333         KVMTRACE_1D(LMSW, vcpu,
 334                     (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
 335                     handler);
 336 }
 337 EXPORT_SYMBOL_GPL(kvm_lmsw);
 338
 339 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 340 {
 341         unsigned long old_cr4 = vcpu->arch.cr4;
 342         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
 343
 344         if (cr4 & CR4_RESERVED_BITS) {
 345                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
 346                 kvm_inject_gp(vcpu, 0);
 347                 return;
 348         }
 349
 350         if (is_long_mode(vcpu)) {
 351                 if (!(cr4 & X86_CR4_PAE)) {
 352                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
 353                                "in long mode\n");
 354                         kvm_inject_gp(vcpu, 0);
 355                         return;
 356                 }
 357         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 358                    && ((cr4 ^ old_cr4) & pdptr_bits)
 359                    && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
 360                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
 361                 kvm_inject_gp(vcpu, 0);
 362                 return;
 363         }
 364
 365         if (cr4 & X86_CR4_VMXE) {
 366                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 367                 kvm_inject_gp(vcpu, 0);
 368                 return;
 369         }
 370         kvm_x86_ops->set_cr4(vcpu, cr4);
 371         vcpu->arch.cr4 = cr4;
 372         vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
 373         kvm_mmu_sync_global(vcpu);
 374         kvm_mmu_reset_context(vcpu);
 375 }
 376 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 377
 378 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 379 {
 380         if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 381                 kvm_mmu_sync_roots(vcpu);
 382                 kvm_mmu_flush_tlb(vcpu);
 383                 return;
 384         }
 385
 386         if (is_long_mode(vcpu)) {
 387                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
 388                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
 389                         kvm_inject_gp(vcpu, 0);
 390                         return;
 391                 }
 392         } else {
 393                 if (is_pae(vcpu)) {
 394                         if (cr3 & CR3_PAE_RESERVED_BITS) {
 395                                 printk(KERN_DEBUG
 396                                        "set_cr3: #GP, reserved bits\n");
 397                                 kvm_inject_gp(vcpu, 0);
 398                                 return;
 399                         }
 400                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
 401                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
 402                                        "reserved bits\n");
 403                                 kvm_inject_gp(vcpu, 0);
 404                                 return;
 405                         }
 406                 }
 407                 /*
 408                  * We don't check reserved bits in nonpae mode, because
 409                  * this isn't enforced, and VMware depends on this.
 410                  */
 411         }
 412
 413         /*
 414          * Does the new cr3 value map to physical memory? (Note, we
 415          * catch an invalid cr3 even in real-mode, because it would
 416          * cause trouble later on when we turn on paging anyway.)
 417          *
 418          * A real CPU would silently accept an invalid cr3 and would
 419          * attempt to use it - with largely undefined (and often hard
 420          * to debug) behavior on the guest side.
 421          */
 422         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
 423                 kvm_inject_gp(vcpu, 0);
 424         else {
 425                 vcpu->arch.cr3 = cr3;
 426                 vcpu->arch.mmu.new_cr3(vcpu);
 427         }
 428 }
 429 EXPORT_SYMBOL_GPL(kvm_set_cr3);
 430
 431 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 432 {
 433         if (cr8 & CR8_RESERVED_BITS) {
 434                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
 435                 kvm_inject_gp(vcpu, 0);
 436                 return;
 437         }
 438         if (irqchip_in_kernel(vcpu->kvm))
 439                 kvm_lapic_set_tpr(vcpu, cr8);
 440         else
 441                 vcpu->arch.cr8 = cr8;
 442 }
 443 EXPORT_SYMBOL_GPL(kvm_set_cr8);
 444
 445 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 446 {
 447         if (irqchip_in_kernel(vcpu->kvm))
 448                 return kvm_lapic_get_cr8(vcpu);
 449         else
 450                 return vcpu->arch.cr8;
 451 }
 452 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 453
 454 static inline u32 bit(int bitno)
 455 {
 456         return 1 << (bitno & 31);
 457 }
 458
 459 /*
 460  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 461  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 462  *
 463  * This list is modified at module load time to reflect the
 464  * capabilities of the host cpu.
 465  */
 466 static u32 msrs_to_save[] = {
 467         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 468         MSR_K6_STAR,
 469 #ifdef CONFIG_X86_64
 470         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 471 #endif
 472         MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 473         MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 474 };
 475
 476 static unsigned num_msrs_to_save;
 477
 478 static u32 emulated_msrs[] = {
 479         MSR_IA32_MISC_ENABLE,
 480 };
 481
 482 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 483 {
 484         if (efer & efer_reserved_bits) {
 485                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
 486                        efer);
 487                 kvm_inject_gp(vcpu, 0);
 488                 return;
 489         }
 490
 491         if (is_paging(vcpu)
 492             && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
 493                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
 494                 kvm_inject_gp(vcpu, 0);
 495                 return;
 496         }
 497
 498         if (efer & EFER_FFXSR) {
 499                 struct kvm_cpuid_entry2 *feat;
 500
 501                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 502                 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
 503                         printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
 504                         kvm_inject_gp(vcpu, 0);
 505                         return;
 506                 }
 507         }
 508
 509         if (efer & EFER_SVME) {
 510                 struct kvm_cpuid_entry2 *feat;
 511
 512                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 513                 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
 514                         printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
 515                         kvm_inject_gp(vcpu, 0);
 516                         return;
 517                 }
 518         }
 519
 520         kvm_x86_ops->set_efer(vcpu, efer);
 521
 522         efer &= ~EFER_LMA;
 523         efer |= vcpu->arch.shadow_efer & EFER_LMA;
 524
 525         vcpu->arch.shadow_efer = efer;
 526
 527         vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
 528         kvm_mmu_reset_context(vcpu);
 529 }
 530
 531 void kvm_enable_efer_bits(u64 mask)
 532 {
 533        efer_reserved_bits &= ~mask;
 534 }
 535 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 536
 537
 538 /*
 539  * Writes msr value into into the appropriate "register".
 540  * Returns 0 on success, non-0 otherwise.
 541  * Assumes vcpu_load() was already called.
 542  */
 543 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 544 {
 545         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
 546 }
 547
 548 /*
 549  * Adapt set_msr() to msr_io()'s calling convention
 550  */
 551 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 552 {
 553         return kvm_set_msr(vcpu, index, *data);
 554 }
 555
 556 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 557 {
 558         static int version;
 559         struct pvclock_wall_clock wc;
 560         struct timespec now, sys, boot;
 561
 562         if (!wall_clock)
 563                 return;
 564
 565         version++;
 566
 567         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 568
 569         /*
 570          * The guest calculates current wall clock time by adding
 571          * system time (updated by kvm_write_guest_time below) to the
 572          * wall clock specified here.  guest system time equals host
 573          * system time for us, thus we must fill in host boot time here.
 574          */
 575         now = current_kernel_time();
 576         ktime_get_ts(&sys);
 577         boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
 578
 579         wc.sec = boot.tv_sec;
 580         wc.nsec = boot.tv_nsec;
 581         wc.version = version;
 582
 583         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
 584
 585         version++;
 586         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 587 }
 588
 589 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 590 {
 591         uint32_t quotient, remainder;
 592
 593         /* Don't try to replace with do_div(), this one calculates
 594          * "(dividend << 32) / divisor" */
 595         __asm__ ( "divl %4"
 596                   : "=a" (quotient), "=d" (remainder)
 597                   : "0" (0), "1" (dividend), "r" (divisor) );
 598         return quotient;
 599 }
 600
 601 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
 602 {
 603         uint64_t nsecs = 1000000000LL;
 604         int32_t  shift = 0;
 605         uint64_t tps64;
 606         uint32_t tps32;
 607
 608         tps64 = tsc_khz * 1000LL;
 609         while (tps64 > nsecs*2) {
 610                 tps64 >>= 1;
 611                 shift--;
 612         }
 613
 614         tps32 = (uint32_t)tps64;
 615         while (tps32 <= (uint32_t)nsecs) {
 616                 tps32 <<= 1;
 617                 shift++;
 618         }
 619
 620         hv_clock->tsc_shift = shift;
 621         hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
 622
 623         pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
 624                  __func__, tsc_khz, hv_clock->tsc_shift,
 625                  hv_clock->tsc_to_system_mul);
 626 }
 627
 628 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 629
 630 static void kvm_write_guest_time(struct kvm_vcpu *v)
 631 {
 632         struct timespec ts;
 633         unsigned long flags;
 634         struct kvm_vcpu_arch *vcpu = &v->arch;
 635         void *shared_kaddr;
 636
 637         if ((!vcpu->time_page))
 638                 return;
 639
 640         preempt_disable();
 641         if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
 642                 kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
 643                 vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 644         }
 645         preempt_enable();
 646
 647         /* Keep irq disabled to prevent changes to the clock */
 648         local_irq_save(flags);
 649         kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
 650                           &vcpu->hv_clock.tsc_timestamp);
 651         ktime_get_ts(&ts);
 652         local_irq_restore(flags);
 653
 654         /* With all the info we got, fill in the values */
 655
 656         vcpu->hv_clock.system_time = ts.tv_nsec +
 657                                      (NSEC_PER_SEC * (u64)ts.tv_sec);
 658         /*
 659          * The interface expects us to write an even number signaling that the
 660          * update is finished. Since the guest won't see the intermediate
 661          * state, we just increase by 2 at the end.
 662          */
 663         vcpu->hv_clock.version += 2;
 664
 665         shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
 666
 667         memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
 668                sizeof(vcpu->hv_clock));
 669
 670         kunmap_atomic(shared_kaddr, KM_USER0);
 671
 672         mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 673 }
 674
 675 static int kvm_request_guest_time_update(struct kvm_vcpu *v)
 676 {
 677         struct kvm_vcpu_arch *vcpu = &v->arch;
 678
 679         if (!vcpu->time_page)
 680                 return 0;
 681         set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
 682         return 1;
 683 }
 684
 685 static bool msr_mtrr_valid(unsigned msr)
 686 {
 687         switch (msr) {
 688         case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
 689         case MSR_MTRRfix64K_00000:
 690         case MSR_MTRRfix16K_80000:
 691         case MSR_MTRRfix16K_A0000:
 692         case MSR_MTRRfix4K_C0000:
 693         case MSR_MTRRfix4K_C8000:
 694         case MSR_MTRRfix4K_D0000:
 695         case MSR_MTRRfix4K_D8000:
 696         case MSR_MTRRfix4K_E0000:
 697         case MSR_MTRRfix4K_E8000:
 698         case MSR_MTRRfix4K_F0000:
 699         case MSR_MTRRfix4K_F8000:
 700         case MSR_MTRRdefType:
 701         case MSR_IA32_CR_PAT:
 702                 return true;
 703         case 0x2f8:
 704                 return true;
 705         }
 706         return false;
 707 }
 708
 709 static bool valid_pat_type(unsigned t)
 710 {
 711         return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
 712 }
 713
 714 static bool valid_mtrr_type(unsigned t)
 715 {
 716         return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
 717 }
 718
 719 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 720 {
 721         int i;
 722
 723         if (!msr_mtrr_valid(msr))
 724                 return false;
 725
 726         if (msr == MSR_IA32_CR_PAT) {
 727                 for (i = 0; i < 8; i++)
 728                         if (!valid_pat_type((data >> (i * 8)) & 0xff))
 729                                 return false;
 730                 return true;
 731         } else if (msr == MSR_MTRRdefType) {
 732                 if (data & ~0xcff)
 733                         return false;
 734                 return valid_mtrr_type(data & 0xff);
 735         } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
 736                 for (i = 0; i < 8 ; i++)
 737                         if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
 738                                 return false;
 739                 return true;
 740         }
 741
 742         /* variable MTRRs */
 743         return valid_mtrr_type(data & 0xff);
 744 }
 745
 746 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 747 {
 748         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
 749
 750         if (!mtrr_valid(vcpu, msr, data))
 751                 return 1;
 752
 753         if (msr == MSR_MTRRdefType) {
 754                 vcpu->arch.mtrr_state.def_type = data;
 755                 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
 756         } else if (msr == MSR_MTRRfix64K_00000)
 757                 p[0] = data;
 758         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
 759                 p[1 + msr - MSR_MTRRfix16K_80000] = data;
 760         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
 761                 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
 762         else if (msr == MSR_IA32_CR_PAT)
 763                 vcpu->arch.pat = data;
 764         else {  /* Variable MTRRs */
 765                 int idx, is_mtrr_mask;
 766                 u64 *pt;
 767
 768                 idx = (msr - 0x200) / 2;
 769                 is_mtrr_mask = msr - 0x200 - 2 * idx;
 770                 if (!is_mtrr_mask)
 771                         pt =
 772                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
 773                 else
 774                         pt =
 775                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
 776                 *pt = data;
 777         }
 778
 779         kvm_mmu_reset_context(vcpu);
 780         return 0;
 781 }
 782
 783 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 784 {
 785         switch (msr) {
 786         case MSR_EFER:
 787                 set_efer(vcpu, data);
 788                 break;
 789         case MSR_IA32_MC0_STATUS:
 790                 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
 791                        __func__, data);
 792                 break;
 793         case MSR_IA32_MCG_STATUS:
 794                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
 795                         __func__, data);
 796                 break;
 797         case MSR_IA32_MCG_CTL:
 798                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
 799                         __func__, data);
 800                 break;
 801         case MSR_IA32_DEBUGCTLMSR:
 802                 if (!data) {
 803                         /* We support the non-activated case already */
 804                         break;
 805                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
 806                         /* Values other than LBR and BTF are vendor-specific,
 807                            thus reserved and should throw a #GP */
 808                         return 1;
 809                 }
 810                 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
 811                         __func__, data);
 812                 break;
 813         case MSR_IA32_UCODE_REV:
 814         case MSR_IA32_UCODE_WRITE:
 815         case MSR_VM_HSAVE_PA:
 816                 break;
 817         case 0x200 ... 0x2ff:
 818                 return set_msr_mtrr(vcpu, msr, data);
 819         case MSR_IA32_APICBASE:
 820                 kvm_set_apic_base(vcpu, data);
 821                 break;
 822         case MSR_IA32_MISC_ENABLE:
 823                 vcpu->arch.ia32_misc_enable_msr = data;
 824                 break;
 825         case MSR_KVM_WALL_CLOCK:
 826                 vcpu->kvm->arch.wall_clock = data;
 827                 kvm_write_wall_clock(vcpu->kvm, data);
 828                 break;
 829         case MSR_KVM_SYSTEM_TIME: {
 830                 if (vcpu->arch.time_page) {
 831                         kvm_release_page_dirty(vcpu->arch.time_page);
 832                         vcpu->arch.time_page = NULL;
 833                 }
 834
 835                 vcpu->arch.time = data;
 836
 837                 /* we verify if the enable bit is set... */
 838                 if (!(data & 1))
 839                         break;
 840
 841                 /* ...but clean it before doing the actual write */
 842                 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
 843
 844                 vcpu->arch.time_page =
 845                                 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
 846
 847                 if (is_error_page(vcpu->arch.time_page)) {
 848                         kvm_release_page_clean(vcpu->arch.time_page);
 849                         vcpu->arch.time_page = NULL;
 850                 }
 851
 852                 kvm_request_guest_time_update(vcpu);
 853                 break;
 854         }
 855         default:
 856                 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
 857                 return 1;
 858         }
 859         return 0;
 860 }
 861 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
 862
 863
 864 /*
 865  * Reads an msr value (of 'msr_index') into 'pdata'.
 866  * Returns 0 on success, non-0 otherwise.
 867  * Assumes vcpu_load() was already called.
 868  */
 869 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 870 {
 871         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
 872 }
 873
 874 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 875 {
 876         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
 877
 878         if (!msr_mtrr_valid(msr))
 879                 return 1;
 880
 881         if (msr == MSR_MTRRdefType)
 882                 *pdata = vcpu->arch.mtrr_state.def_type +
 883                          (vcpu->arch.mtrr_state.enabled << 10);
 884         else if (msr == MSR_MTRRfix64K_00000)
 885                 *pdata = p[0];
 886         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
 887                 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
 888         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
 889                 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
 890         else if (msr == MSR_IA32_CR_PAT)
 891                 *pdata = vcpu->arch.pat;
 892         else {  /* Variable MTRRs */
 893                 int idx, is_mtrr_mask;
 894                 u64 *pt;
 895
 896                 idx = (msr - 0x200) / 2;
 897                 is_mtrr_mask = msr - 0x200 - 2 * idx;
 898                 if (!is_mtrr_mask)
 899                         pt =
 900                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
 901                 else
 902                         pt =
 903                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
 904                 *pdata = *pt;
 905         }
 906
 907         return 0;
 908 }
 909
 910 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 911 {
 912         u64 data;
 913
 914         switch (msr) {
 915         case 0xc0010010: /* SYSCFG */
 916         case 0xc0010015: /* HWCR */
 917         case MSR_IA32_PLATFORM_ID:
 918         case MSR_IA32_P5_MC_ADDR:
 919         case MSR_IA32_P5_MC_TYPE:
 920         case MSR_IA32_MC0_CTL:
 921         case MSR_IA32_MCG_STATUS:
 922         case MSR_IA32_MCG_CAP:
 923         case MSR_IA32_MCG_CTL:
 924         case MSR_IA32_MC0_MISC:
 925         case MSR_IA32_MC0_MISC+4:
 926         case MSR_IA32_MC0_MISC+8:
 927         case MSR_IA32_MC0_MISC+12:
 928         case MSR_IA32_MC0_MISC+16:
 929         case MSR_IA32_MC0_MISC+20:
 930         case MSR_IA32_UCODE_REV:
 931         case MSR_IA32_EBL_CR_POWERON:
 932         case MSR_IA32_DEBUGCTLMSR:
 933         case MSR_IA32_LASTBRANCHFROMIP:
 934         case MSR_IA32_LASTBRANCHTOIP:
 935         case MSR_IA32_LASTINTFROMIP:
 936         case MSR_IA32_LASTINTTOIP:
 937         case MSR_VM_HSAVE_PA:
 938         case MSR_P6_EVNTSEL0:
 939         case MSR_P6_EVNTSEL1:
 940         case MSR_K7_EVNTSEL0:
 941                 data = 0;
 942                 break;
 943         case MSR_MTRRcap:
 944                 data = 0x500 | KVM_NR_VAR_MTRR;
 945                 break;
 946         case 0x200 ... 0x2ff:
 947                 return get_msr_mtrr(vcpu, msr, pdata);
 948         case 0xcd: /* fsb frequency */
 949                 data = 3;
 950                 break;
 951         case MSR_IA32_APICBASE:
 952                 data = kvm_get_apic_base(vcpu);
 953                 break;
 954         case MSR_IA32_MISC_ENABLE:
 955                 data = vcpu->arch.ia32_misc_enable_msr;
 956                 break;
 957         case MSR_IA32_PERF_STATUS:
 958                 /* TSC increment by tick */
 959                 data = 1000ULL;
 960                 /* CPU multiplier */
 961                 data |= (((uint64_t)4ULL) << 40);
 962                 break;
 963         case MSR_EFER:
 964                 data = vcpu->arch.shadow_efer;
 965                 break;
 966         case MSR_KVM_WALL_CLOCK:
 967                 data = vcpu->kvm->arch.wall_clock;
 968                 break;
 969         case MSR_KVM_SYSTEM_TIME:
 970                 data = vcpu->arch.time;
 971                 break;
 972         default:
 973                 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 974                 return 1;
 975         }
 976         *pdata = data;
 977         return 0;
 978 }
 979 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
 980
 981 /*
 982  * Read or write a bunch of msrs. All parameters are kernel addresses.
 983  *
 984  * @return number of msrs set successfully.
 985  */
 986 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
 987                     struct kvm_msr_entry *entries,
 988                     int (*do_msr)(struct kvm_vcpu *vcpu,
 989                                   unsigned index, u64 *data))
 990 {
 991         int i;
 992
 993         vcpu_load(vcpu);
 994
 995         down_read(&vcpu->kvm->slots_lock);
 996         for (i = 0; i < msrs->nmsrs; ++i)
 997                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
 998                         break;
 999         up_read(&vcpu->kvm->slots_lock);
1000
1001         vcpu_put(vcpu);
1002
1003         return i;
1004 }
1005
1006 /*
1007  * Read or write a bunch of msrs. Parameters are user addresses.
1008  *
1009  * @return number of msrs set successfully.
1010  */
1011 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
1012                   int (*do_msr)(struct kvm_vcpu *vcpu,
1013                                 unsigned index, u64 *data),
1014                   int writeback)
1015 {
1016         struct kvm_msrs msrs;
1017         struct kvm_msr_entry *entries;
1018         int r, n;
1019         unsigned size;
1020
1021         r = -EFAULT;
1022         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
1023                 goto out;
1024
1025         r = -E2BIG;
1026         if (msrs.nmsrs >= MAX_IO_MSRS)
1027                 goto out;
1028
1029         r = -ENOMEM;
1030         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1031         entries = vmalloc(size);
1032         if (!entries)
1033                 goto out;
1034
1035         r = -EFAULT;
1036         if (copy_from_user(entries, user_msrs->entries, size))
1037                 goto out_free;
1038
1039         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
1040         if (r < 0)
1041                 goto out_free;
1042
1043         r = -EFAULT;
1044         if (writeback && copy_to_user(user_msrs->entries, entries, size))
1045                 goto out_free;
1046
1047         r = n;
1048
1049 out_free:
1050         vfree(entries);
1051 out:
1052         return r;
1053 }
1054
1055 int kvm_dev_ioctl_check_extension(long ext)
1056 {
1057         int r;
1058
1059         switch (ext) {
1060         case KVM_CAP_IRQCHIP:
1061         case KVM_CAP_HLT:
1062         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
1063         case KVM_CAP_SET_TSS_ADDR:
1064         case KVM_CAP_EXT_CPUID:
1065         case KVM_CAP_CLOCKSOURCE:
1066         case KVM_CAP_PIT:
1067         case KVM_CAP_NOP_IO_DELAY:
1068         case KVM_CAP_MP_STATE:
1069         case KVM_CAP_SYNC_MMU:
1070         case KVM_CAP_REINJECT_CONTROL:
1071         case KVM_CAP_IRQ_INJECT_STATUS:
1072                 r = 1;
1073                 break;
1074         case KVM_CAP_COALESCED_MMIO:
1075                 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
1076                 break;
1077         case KVM_CAP_VAPIC:
1078                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
1079                 break;
1080         case KVM_CAP_NR_VCPUS:
1081                 r = KVM_MAX_VCPUS;
1082                 break;
1083         case KVM_CAP_NR_MEMSLOTS:
1084                 r = KVM_MEMORY_SLOTS;
1085                 break;
1086         case KVM_CAP_PV_MMU:
1087                 r = !tdp_enabled;
1088                 break;
1089         case KVM_CAP_IOMMU:
1090                 r = iommu_found();
1091                 break;
1092         default:
1093                 r = 0;
1094                 break;
1095         }
1096         return r;
1097
1098 }
1099
1100 long kvm_arch_dev_ioctl(struct file *filp,
1101                         unsigned int ioctl, unsigned long arg)
1102 {
1103         void __user *argp = (void __user *)arg;
1104         long r;
1105
1106         switch (ioctl) {
1107         case KVM_GET_MSR_INDEX_LIST: {
1108                 struct kvm_msr_list __user *user_msr_list = argp;
1109                 struct kvm_msr_list msr_list;
1110                 unsigned n;
1111
1112                 r = -EFAULT;
1113                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1114                         goto out;
1115                 n = msr_list.nmsrs;
1116                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
1117                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1118                         goto out;
1119                 r = -E2BIG;
1120                 if (n < msr_list.nmsrs)
1121                         goto out;
1122                 r = -EFAULT;
1123                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1124                                  num_msrs_to_save * sizeof(u32)))
1125                         goto out;
1126                 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
1127                                  &emulated_msrs,
1128                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1129                         goto out;
1130                 r = 0;
1131                 break;
1132         }
1133         case KVM_GET_SUPPORTED_CPUID: {
1134                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1135                 struct kvm_cpuid2 cpuid;
1136
1137                 r = -EFAULT;
1138                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1139                         goto out;
1140                 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
1141                                                       cpuid_arg->entries);
1142                 if (r)
1143                         goto out;
1144
1145                 r = -EFAULT;
1146                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1147                         goto out;
1148                 r = 0;
1149                 break;
1150         }
1151         default:
1152                 r = -EINVAL;
1153         }
1154 out:
1155         return r;
1156 }
1157
1158 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1159 {
1160         kvm_x86_ops->vcpu_load(vcpu, cpu);
1161         kvm_request_guest_time_update(vcpu);
1162 }
1163
1164 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1165 {
1166         kvm_x86_ops->vcpu_put(vcpu);
1167         kvm_put_guest_fpu(vcpu);
1168 }
1169
1170 static int is_efer_nx(void)
1171 {
1172         unsigned long long efer = 0;
1173
1174         rdmsrl_safe(MSR_EFER, &efer);
1175         return efer & EFER_NX;
1176 }
1177
1178 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
1179 {
1180         int i;
1181         struct kvm_cpuid_entry2 *e, *entry;
1182
1183         entry = NULL;
1184         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
1185                 e = &vcpu->arch.cpuid_entries[i];
1186                 if (e->function == 0x80000001) {
1187                         entry = e;
1188                         break;
1189                 }
1190         }
1191         if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
1192                 entry->edx &= ~(1 << 20);
1193                 printk(KERN_INFO "kvm: guest NX capability removed\n");
1194         }
1195 }
1196
1197 /* when an old userspace process fills a new kernel module */
1198 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1199                                     struct kvm_cpuid *cpuid,
1200                                     struct kvm_cpuid_entry __user *entries)
1201 {
1202         int r, i;
1203         struct kvm_cpuid_entry *cpuid_entries;
1204
1205         r = -E2BIG;
1206         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1207                 goto out;
1208         r = -ENOMEM;
1209         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
1210         if (!cpuid_entries)
1211                 goto out;
1212         r = -EFAULT;
1213         if (copy_from_user(cpuid_entries, entries,
1214                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1215                 goto out_free;
1216         for (i = 0; i < cpuid->nent; i++) {
1217                 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1218                 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
1219                 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
1220                 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
1221                 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
1222                 vcpu->arch.cpuid_entries[i].index = 0;
1223                 vcpu->arch.cpuid_entries[i].flags = 0;
1224                 vcpu->arch.cpuid_entries[i].padding[0] = 0;
1225                 vcpu->arch.cpuid_entries[i].padding[1] = 0;
1226                 vcpu->arch.cpuid_entries[i].padding[2] = 0;
1227         }
1228         vcpu->arch.cpuid_nent = cpuid->nent;
1229         cpuid_fix_nx_cap(vcpu);
1230         r = 0;
1231
1232 out_free:
1233         vfree(cpuid_entries);
1234 out:
1235         return r;
1236 }
1237
1238 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1239                                      struct kvm_cpuid2 *cpuid,
1240                                      struct kvm_cpuid_entry2 __user *entries)
1241 {
1242         int r;
1243
1244         r = -E2BIG;
1245         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1246                 goto out;
1247         r = -EFAULT;
1248         if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1249                            cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1250                 goto out;
1251         vcpu->arch.cpuid_nent = cpuid->nent;
1252         return 0;
1253
1254 out:
1255         return r;
1256 }
1257
1258 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1259                                      struct kvm_cpuid2 *cpuid,
1260                                      struct kvm_cpuid_entry2 __user *entries)
1261 {
1262         int r;
1263
1264         r = -E2BIG;
1265         if (cpuid->nent < vcpu->arch.cpuid_nent)
1266                 goto out;
1267         r = -EFAULT;
1268         if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1269                          vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1270                 goto out;
1271         return 0;
1272
1273 out:
1274         cpuid->nent = vcpu->arch.cpuid_nent;
1275         return r;
1276 }
1277
1278 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1279                            u32 index)
1280 {
1281         entry->function = function;
1282         entry->index = index;
1283         cpuid_count(entry->function, entry->index,
1284                     &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1285         entry->flags = 0;
1286 }
1287
1288 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1289                          u32 index, int *nent, int maxnent)
1290 {
1291         const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
1292                 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1293                 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1294                 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1295                 bit(X86_FEATURE_MCE) |
1296                 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1297                 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_MTRR) |
1298                 bit(X86_FEATURE_PGE) | bit(X86_FEATURE_MCA) |
1299                 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PAT) |
1300                 bit(X86_FEATURE_PSE36) |
1301                 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
1302                 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
1303                 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
1304         const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
1305                 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1306                 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1307                 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1308                 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1309                 bit(X86_FEATURE_PGE) |
1310                 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1311                 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
1312                 bit(X86_FEATURE_SYSCALL) |
1313                 (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
1314 #ifdef CONFIG_X86_64
1315                 bit(X86_FEATURE_LM) |
1316 #endif
1317                 bit(X86_FEATURE_FXSR_OPT) |
1318                 bit(X86_FEATURE_MMXEXT) |
1319                 bit(X86_FEATURE_3DNOWEXT) |
1320                 bit(X86_FEATURE_3DNOW);
1321         const u32 kvm_supported_word3_x86_features =
1322                 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
1323         const u32 kvm_supported_word6_x86_features =
1324                 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) |
1325                 bit(X86_FEATURE_SVM);
1326
1327         /* all calls to cpuid_count() should be made on the same cpu */
1328         get_cpu();
1329         do_cpuid_1_ent(entry, function, index);
1330         ++*nent;
1331
1332         switch (function) {
1333         case 0:
1334                 entry->eax = min(entry->eax, (u32)0xb);
1335                 break;
1336         case 1:
1337                 entry->edx &= kvm_supported_word0_x86_features;
1338                 entry->ecx &= kvm_supported_word3_x86_features;
1339                 break;
1340         /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1341          * may return different values. This forces us to get_cpu() before
1342          * issuing the first command, and also to emulate this annoying behavior
1343          * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1344         case 2: {
1345                 int t, times = entry->eax & 0xff;
1346
1347                 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1348                 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
1349                 for (t = 1; t < times && *nent < maxnent; ++t) {
1350                         do_cpuid_1_ent(&entry[t], function, 0);
1351                         entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1352                         ++*nent;
1353                 }
1354                 break;
1355         }
1356         /* function 4 and 0xb have additional index. */
1357         case 4: {
1358                 int i, cache_type;
1359
1360                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1361                 /* read more entries until cache_type is zero */
1362                 for (i = 1; *nent < maxnent; ++i) {
1363                         cache_type = entry[i - 1].eax & 0x1f;
1364                         if (!cache_type)
1365                                 break;
1366                         do_cpuid_1_ent(&entry[i], function, i);
1367                         entry[i].flags |=
1368                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1369                         ++*nent;
1370                 }
1371                 break;
1372         }
1373         case 0xb: {
1374                 int i, level_type;
1375
1376                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1377                 /* read more entries until level_type is zero */
1378                 for (i = 1; *nent < maxnent; ++i) {
1379                         level_type = entry[i - 1].ecx & 0xff00;
1380                         if (!level_type)
1381                                 break;
1382                         do_cpuid_1_ent(&entry[i], function, i);
1383                         entry[i].flags |=
1384                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1385                         ++*nent;
1386                 }
1387                 break;
1388         }
1389         case 0x80000000:
1390                 entry->eax = min(entry->eax, 0x8000001a);
1391                 break;
1392         case 0x80000001:
1393                 entry->edx &= kvm_supported_word1_x86_features;
1394                 entry->ecx &= kvm_supported_word6_x86_features;
1395                 break;
1396         }
1397         put_cpu();
1398 }
1399
1400 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1401                                      struct kvm_cpuid_entry2 __user *entries)
1402 {
1403         struct kvm_cpuid_entry2 *cpuid_entries;
1404         int limit, nent = 0, r = -E2BIG;
1405         u32 func;
1406
1407         if (cpuid->nent < 1)
1408                 goto out;
1409         r = -ENOMEM;
1410         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1411         if (!cpuid_entries)
1412                 goto out;
1413
1414         do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1415         limit = cpuid_entries[0].eax;
1416         for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1417                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1418                              &nent, cpuid->nent);
1419         r = -E2BIG;
1420         if (nent >= cpuid->nent)
1421                 goto out_free;
1422
1423         do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1424         limit = cpuid_entries[nent - 1].eax;
1425         for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1426                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1427                              &nent, cpuid->nent);
1428         r = -EFAULT;
1429         if (copy_to_user(entries, cpuid_entries,
1430                          nent * sizeof(struct kvm_cpuid_entry2)))
1431                 goto out_free;
1432         cpuid->nent = nent;
1433         r = 0;
1434
1435 out_free:
1436         vfree(cpuid_entries);
1437 out:
1438         return r;
1439 }
1440
1441 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1442                                     struct kvm_lapic_state *s)
1443 {
1444         vcpu_load(vcpu);
1445         memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1446         vcpu_put(vcpu);
1447
1448         return 0;
1449 }
1450
1451 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1452                                     struct kvm_lapic_state *s)
1453 {
1454         vcpu_load(vcpu);
1455         memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1456         kvm_apic_post_state_restore(vcpu);
1457         vcpu_put(vcpu);
1458
1459         return 0;
1460 }
1461
1462 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1463                                     struct kvm_interrupt *irq)
1464 {
1465         if (irq->irq < 0 || irq->irq >= 256)
1466                 return -EINVAL;
1467         if (irqchip_in_kernel(vcpu->kvm))
1468                 return -ENXIO;
1469         vcpu_load(vcpu);
1470
1471         set_bit(irq->irq, vcpu->arch.irq_pending);
1472         set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1473
1474         vcpu_put(vcpu);
1475
1476         return 0;
1477 }
1478
/*
 * KVM_NMI: call kvm_inject_nmi() on the vcpu while it is loaded.
 * Always returns 0.
 */
static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);

	kvm_inject_nmi(vcpu);

	vcpu_put(vcpu);
	return 0;
}
1487
1488 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1489                                            struct kvm_tpr_access_ctl *tac)
1490 {
1491         if (tac->flags)
1492                 return -EINVAL;
1493         vcpu->arch.tpr_access_reporting = !!tac->enabled;
1494         return 0;
1495 }
1496
1497 long kvm_arch_vcpu_ioctl(struct file *filp,
1498                          unsigned int ioctl, unsigned long arg)
1499 {
1500         struct kvm_vcpu *vcpu = filp->private_data;
1501         void __user *argp = (void __user *)arg;
1502         int r;
1503         struct kvm_lapic_state *lapic = NULL;
1504
1505         switch (ioctl) {
1506         case KVM_GET_LAPIC: {
1507                 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1508
1509                 r = -ENOMEM;
1510                 if (!lapic)
1511                         goto out;
1512                 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1513                 if (r)
1514                         goto out;
1515                 r = -EFAULT;
1516                 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1517                         goto out;
1518                 r = 0;
1519                 break;
1520         }
1521         case KVM_SET_LAPIC: {
1522                 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1523                 r = -ENOMEM;
1524                 if (!lapic)
1525                         goto out;
1526                 r = -EFAULT;
1527                 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1528                         goto out;
1529                 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1530                 if (r)
1531                         goto out;
1532                 r = 0;
1533                 break;
1534         }
1535         case KVM_INTERRUPT: {
1536                 struct kvm_interrupt irq;
1537
1538                 r = -EFAULT;
1539                 if (copy_from_user(&irq, argp, sizeof irq))
1540                         goto out;
1541                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1542                 if (r)
1543                         goto out;
1544                 r = 0;
1545                 break;
1546         }
1547         case KVM_NMI: {
1548                 r = kvm_vcpu_ioctl_nmi(vcpu);
1549                 if (r)
1550                         goto out;
1551                 r = 0;
1552                 break;
1553         }
1554         case KVM_SET_CPUID: {
1555                 struct kvm_cpuid __user *cpuid_arg = argp;
1556                 struct kvm_cpuid cpuid;
1557
1558                 r = -EFAULT;
1559                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1560                         goto out;
1561                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1562                 if (r)
1563                         goto out;
1564                 break;
1565         }
1566         case KVM_SET_CPUID2: {
1567                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1568                 struct kvm_cpuid2 cpuid;
1569
1570                 r = -EFAULT;
1571                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1572                         goto out;
1573                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1574                                               cpuid_arg->entries);
1575                 if (r)
1576                         goto out;
1577                 break;
1578         }
1579         case KVM_GET_CPUID2: {
1580                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1581                 struct kvm_cpuid2 cpuid;
1582
1583                 r = -EFAULT;
1584                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1585                         goto out;
1586                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1587                                               cpuid_arg->entries);
1588                 if (r)
1589                         goto out;
1590                 r = -EFAULT;
1591                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1592                         goto out;
1593                 r = 0;
1594                 break;
1595         }
1596         case KVM_GET_MSRS:
1597                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1598                 break;
1599         case KVM_SET_MSRS:
1600                 r = msr_io(vcpu, argp, do_set_msr, 0);
1601                 break;
1602         case KVM_TPR_ACCESS_REPORTING: {
1603                 struct kvm_tpr_access_ctl tac;
1604
1605                 r = -EFAULT;
1606                 if (copy_from_user(&tac, argp, sizeof tac))
1607                         goto out;
1608                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1609                 if (r)
1610                         goto out;
1611                 r = -EFAULT;
1612                 if (copy_to_user(argp, &tac, sizeof tac))
1613                         goto out;
1614                 r = 0;
1615                 break;
1616         };
1617         case KVM_SET_VAPIC_ADDR: {
1618                 struct kvm_vapic_addr va;
1619
1620                 r = -EINVAL;
1621                 if (!irqchip_in_kernel(vcpu->kvm))
1622                         goto out;
1623                 r = -EFAULT;
1624                 if (copy_from_user(&va, argp, sizeof va))
1625                         goto out;
1626                 r = 0;
1627                 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1628                 break;
1629         }
1630         default:
1631                 r = -EINVAL;
1632         }
1633 out:
1634         if (lapic)
1635                 kfree(lapic);
1636         return r;
1637 }
1638
1639 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1640 {
1641         int ret;
1642
1643         if (addr > (unsigned int)(-3 * PAGE_SIZE))
1644                 return -1;
1645         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1646         return ret;
1647 }
1648
1649 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1650                                           u32 kvm_nr_mmu_pages)
1651 {
1652         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1653                 return -EINVAL;
1654
1655         down_write(&kvm->slots_lock);
1656         spin_lock(&kvm->mmu_lock);
1657
1658         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1659         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1660
1661         spin_unlock(&kvm->mmu_lock);
1662         up_write(&kvm->slots_lock);
1663         return 0;
1664 }
1665
1666 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1667 {
1668         return kvm->arch.n_alloc_mmu_pages;
1669 }
1670
1671 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1672 {
1673         int i;
1674         struct kvm_mem_alias *alias;
1675
1676         for (i = 0; i < kvm->arch.naliases; ++i) {
1677                 alias = &kvm->arch.aliases[i];
1678                 if (gfn >= alias->base_gfn
1679                     && gfn < alias->base_gfn + alias->npages)
1680                         return alias->target_gfn + gfn - alias->base_gfn;
1681         }
1682         return gfn;
1683 }
1684
1685 /*
1686  * Set a new alias region.  Aliases map a portion of physical memory into
1687  * another portion.  This is useful for memory windows, for example the PC
1688  * VGA region.
1689  */
1690 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1691                                          struct kvm_memory_alias *alias)
1692 {
1693         int r, n;
1694         struct kvm_mem_alias *p;
1695
1696         r = -EINVAL;
1697         /* General sanity checks */
1698         if (alias->memory_size & (PAGE_SIZE - 1))
1699                 goto out;
1700         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1701                 goto out;
1702         if (alias->slot >= KVM_ALIAS_SLOTS)
1703                 goto out;
1704         if (alias->guest_phys_addr + alias->memory_size
1705             < alias->guest_phys_addr)
1706                 goto out;
1707         if (alias->target_phys_addr + alias->memory_size
1708             < alias->target_phys_addr)
1709                 goto out;
1710
1711         down_write(&kvm->slots_lock);
1712         spin_lock(&kvm->mmu_lock);
1713
1714         p = &kvm->arch.aliases[alias->slot];
1715         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1716         p->npages = alias->memory_size >> PAGE_SHIFT;
1717         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1718
1719         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1720                 if (kvm->arch.aliases[n - 1].npages)
1721                         break;
1722         kvm->arch.naliases = n;
1723
1724         spin_unlock(&kvm->mmu_lock);
1725         kvm_mmu_zap_all(kvm);
1726
1727         up_write(&kvm->slots_lock);
1728
1729         return 0;
1730
1731 out:
1732         return r;
1733 }
1734
1735 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1736 {
1737         int r;
1738
1739         r = 0;
1740         switch (chip->chip_id) {
1741         case KVM_IRQCHIP_PIC_MASTER:
1742                 memcpy(&chip->chip.pic,
1743                         &pic_irqchip(kvm)->pics[0],
1744                         sizeof(struct kvm_pic_state));
1745                 break;
1746         case KVM_IRQCHIP_PIC_SLAVE:
1747                 memcpy(&chip->chip.pic,
1748                         &pic_irqchip(kvm)->pics[1],
1749                         sizeof(struct kvm_pic_state));
1750                 break;
1751         case KVM_IRQCHIP_IOAPIC:
1752                 memcpy(&chip->chip.ioapic,
1753                         ioapic_irqchip(kvm),
1754                         sizeof(struct kvm_ioapic_state));
1755                 break;
1756         default:
1757                 r = -EINVAL;
1758                 break;
1759         }
1760         return r;
1761 }
1762
1763 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1764 {
1765         int r;
1766
1767         r = 0;
1768         switch (chip->chip_id) {
1769         case KVM_IRQCHIP_PIC_MASTER:
1770                 memcpy(&pic_irqchip(kvm)->pics[0],
1771                         &chip->chip.pic,
1772                         sizeof(struct kvm_pic_state));
1773                 break;
1774         case KVM_IRQCHIP_PIC_SLAVE:
1775                 memcpy(&pic_irqchip(kvm)->pics[1],
1776                         &chip->chip.pic,
1777                         sizeof(struct kvm_pic_state));
1778                 break;
1779         case KVM_IRQCHIP_IOAPIC:
1780                 memcpy(ioapic_irqchip(kvm),
1781                         &chip->chip.ioapic,
1782                         sizeof(struct kvm_ioapic_state));
1783                 break;
1784         default:
1785                 r = -EINVAL;
1786                 break;
1787         }
1788         kvm_pic_update_irq(pic_irqchip(kvm));
1789         return r;
1790 }
1791
1792 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1793 {
1794         int r = 0;
1795
1796         memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
1797         return r;
1798 }
1799
1800 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1801 {
1802         int r = 0;
1803
1804         memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
1805         kvm_pit_load_count(kvm, 0, ps->channels[0].count);
1806         return r;
1807 }
1808
1809 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
1810                                  struct kvm_reinject_control *control)
1811 {
1812         if (!kvm->arch.vpit)
1813                 return -ENXIO;
1814         kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
1815         return 0;
1816 }
1817
1818 /*
1819  * Get (and clear) the dirty memory log for a memory slot.
1820  */
1821 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1822                                       struct kvm_dirty_log *log)
1823 {
1824         int r;
1825         int n;
1826         struct kvm_memory_slot *memslot;
1827         int is_dirty = 0;
1828
1829         down_write(&kvm->slots_lock);
1830
1831         r = kvm_get_dirty_log(kvm, log, &is_dirty);
1832         if (r)
1833                 goto out;
1834
1835         /* If nothing is dirty, don't bother messing with page tables. */
1836         if (is_dirty) {
1837                 spin_lock(&kvm->mmu_lock);
1838                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1839                 spin_unlock(&kvm->mmu_lock);
1840                 kvm_flush_remote_tlbs(kvm);
1841                 memslot = &kvm->memslots[log->slot];
1842                 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1843                 memset(memslot->dirty_bitmap, 0, n);
1844         }
1845         r = 0;
1846 out:
1847         up_write(&kvm->slots_lock);
1848         return r;
1849 }
1850
1851 long kvm_arch_vm_ioctl(struct file *filp,
1852                        unsigned int ioctl, unsigned long arg)
1853 {
1854         struct kvm *kvm = filp->private_data;
1855         void __user *argp = (void __user *)arg;
1856         int r = -EINVAL;
1857         /*
1858          * This union makes it completely explicit to gcc-3.x
1859          * that these two variables' stack usage should be
1860          * combined, not added together.
1861          */
1862         union {
1863                 struct kvm_pit_state ps;
1864                 struct kvm_memory_alias alias;
1865         } u;
1866
1867         switch (ioctl) {
1868         case KVM_SET_TSS_ADDR:
1869                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1870                 if (r < 0)
1871                         goto out;
1872                 break;
1873         case KVM_SET_MEMORY_REGION: {
1874                 struct kvm_memory_region kvm_mem;
1875                 struct kvm_userspace_memory_region kvm_userspace_mem;
1876
1877                 r = -EFAULT;
1878                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1879                         goto out;
1880                 kvm_userspace_mem.slot = kvm_mem.slot;
1881                 kvm_userspace_mem.flags = kvm_mem.flags;
1882                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1883                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1884                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1885                 if (r)
1886                         goto out;
1887                 break;
1888         }
1889         case KVM_SET_NR_MMU_PAGES:
1890                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1891                 if (r)
1892                         goto out;
1893                 break;
1894         case KVM_GET_NR_MMU_PAGES:
1895                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1896                 break;
1897         case KVM_SET_MEMORY_ALIAS:
1898                 r = -EFAULT;
1899                 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
1900                         goto out;
1901                 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
1902                 if (r)
1903                         goto out;
1904                 break;
1905         case KVM_CREATE_IRQCHIP:
1906                 r = -ENOMEM;
1907                 kvm->arch.vpic = kvm_create_pic(kvm);
1908                 if (kvm->arch.vpic) {
1909                         r = kvm_ioapic_init(kvm);
1910                         if (r) {
1911                                 kfree(kvm->arch.vpic);
1912                                 kvm->arch.vpic = NULL;
1913                                 goto out;
1914                         }
1915                 } else
1916                         goto out;
1917                 r = kvm_setup_default_irq_routing(kvm);
1918                 if (r) {
1919                         kfree(kvm->arch.vpic);
1920                         kfree(kvm->arch.vioapic);
1921                         goto out;
1922                 }
1923                 break;
1924         case KVM_CREATE_PIT:
1925                 mutex_lock(&kvm->lock);
1926                 r = -EEXIST;
1927                 if (kvm->arch.vpit)
1928                         goto create_pit_unlock;
1929                 r = -ENOMEM;
1930                 kvm->arch.vpit = kvm_create_pit(kvm);
1931                 if (kvm->arch.vpit)
1932                         r = 0;
1933         create_pit_unlock:
1934                 mutex_unlock(&kvm->lock);
1935                 break;
1936         case KVM_IRQ_LINE_STATUS:
1937         case KVM_IRQ_LINE: {
1938                 struct kvm_irq_level irq_event;
1939
1940                 r = -EFAULT;
1941                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
1942                         goto out;
1943                 if (irqchip_in_kernel(kvm)) {
1944                         __s32 status;
1945                         mutex_lock(&kvm->lock);
1946                         status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
1947                                         irq_event.irq, irq_event.level);
1948                         mutex_unlock(&kvm->lock);
1949                         if (ioctl == KVM_IRQ_LINE_STATUS) {
1950                                 irq_event.status = status;
1951                                 if (copy_to_user(argp, &irq_event,
1952                                                         sizeof irq_event))
1953                                         goto out;
1954                         }
1955                         r = 0;
1956                 }
1957                 break;
1958         }
1959         case KVM_GET_IRQCHIP: {
1960                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1961                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1962
1963                 r = -ENOMEM;
1964                 if (!chip)
1965                         goto out;
1966                 r = -EFAULT;
1967                 if (copy_from_user(chip, argp, sizeof *chip))
1968                         goto get_irqchip_out;
1969                 r = -ENXIO;
1970                 if (!irqchip_in_kernel(kvm))
1971                         goto get_irqchip_out;
1972                 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
1973                 if (r)
1974                         goto get_irqchip_out;
1975                 r = -EFAULT;
1976                 if (copy_to_user(argp, chip, sizeof *chip))
1977                         goto get_irqchip_out;
1978                 r = 0;
1979         get_irqchip_out:
1980                 kfree(chip);
1981                 if (r)
1982                         goto out;
1983                 break;
1984         }
1985         case KVM_SET_IRQCHIP: {
1986                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1987                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1988
1989                 r = -ENOMEM;
1990                 if (!chip)
1991                         goto out;
1992                 r = -EFAULT;
1993                 if (copy_from_user(chip, argp, sizeof *chip))
1994                         goto set_irqchip_out;
1995                 r = -ENXIO;
1996                 if (!irqchip_in_kernel(kvm))
1997                         goto set_irqchip_out;
1998                 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
1999                 if (r)
2000                         goto set_irqchip_out;
2001                 r = 0;
2002         set_irqchip_out:
2003                 kfree(chip);
2004                 if (r)
2005                         goto out;
2006                 break;
2007         }
2008         case KVM_GET_PIT: {
2009                 r = -EFAULT;
2010                 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
2011                         goto out;
2012                 r = -ENXIO;
2013                 if (!kvm->arch.vpit)
2014                         goto out;
2015                 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
2016                 if (r)
2017                         goto out;
2018                 r = -EFAULT;
2019                 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
2020                         goto out;
2021                 r = 0;
2022                 break;
2023         }
2024         case KVM_SET_PIT: {
2025                 r = -EFAULT;
2026                 if (copy_from_user(&u.ps, argp, sizeof u.ps))
2027                         goto out;
2028                 r = -ENXIO;
2029                 if (!kvm->arch.vpit)
2030                         goto out;
2031                 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
2032                 if (r)
2033                         goto out;
2034                 r = 0;
2035                 break;
2036         }
2037         case KVM_REINJECT_CONTROL: {
2038                 struct kvm_reinject_control control;
2039                 r =  -EFAULT;
2040                 if (copy_from_user(&control, argp, sizeof(control)))
2041                         goto out;
2042                 r = kvm_vm_ioctl_reinject(kvm, &control);
2043                 if (r)
2044                         goto out;
2045                 r = 0;
2046                 break;
2047         }
2048         default:
2049                 ;
2050         }
2051 out:
2052         return r;
2053 }
2054
2055 static void kvm_init_msr_list(void)
2056 {
2057         u32 dummy[2];
2058         unsigned i, j;
2059
2060         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2061                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2062                         continue;
2063                 if (j < i)
2064                         msrs_to_save[j] = msrs_to_save[i];
2065                 j++;
2066         }
2067         num_msrs_to_save = j;
2068 }
2069
2070 /*
2071  * Only apic need an MMIO device hook, so shortcut now..
2072  */
2073 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
2074                                                 gpa_t addr, int len,
2075                                                 int is_write)
2076 {
2077         struct kvm_io_device *dev;
2078
2079         if (vcpu->arch.apic) {
2080                 dev = &vcpu->arch.apic->dev;
2081                 if (dev->in_range(dev, addr, len, is_write))
2082                         return dev;
2083         }
2084         return NULL;
2085 }
2086
2087
2088 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
2089                                                 gpa_t addr, int len,
2090                                                 int is_write)
2091 {
2092         struct kvm_io_device *dev;
2093
2094         dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
2095         if (dev == NULL)
2096                 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
2097                                           is_write);
2098         return dev;
2099 }
2100
2101 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
2102                                struct kvm_vcpu *vcpu)
2103 {
2104         void *data = val;
2105         int r = X86EMUL_CONTINUE;
2106
2107         while (bytes) {
2108                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2109                 unsigned offset = addr & (PAGE_SIZE-1);
2110                 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2111                 int ret;
2112
2113                 if (gpa == UNMAPPED_GVA) {
2114                         r = X86EMUL_PROPAGATE_FAULT;
2115                         goto out;
2116                 }
2117                 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
2118                 if (ret < 0) {
2119                         r = X86EMUL_UNHANDLEABLE;
2120                         goto out;
2121                 }
2122
2123                 bytes -= toread;
2124                 data += toread;
2125                 addr += toread;
2126         }
2127 out:
2128         return r;
2129 }
2130
2131 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2132                                 struct kvm_vcpu *vcpu)
2133 {
2134         void *data = val;
2135         int r = X86EMUL_CONTINUE;
2136
2137         while (bytes) {
2138                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2139                 unsigned offset = addr & (PAGE_SIZE-1);
2140                 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
2141                 int ret;
2142
2143                 if (gpa == UNMAPPED_GVA) {
2144                         r = X86EMUL_PROPAGATE_FAULT;
2145                         goto out;
2146                 }
2147                 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
2148                 if (ret < 0) {
2149                         r = X86EMUL_UNHANDLEABLE;
2150                         goto out;
2151                 }
2152
2153                 bytes -= towrite;
2154                 data += towrite;
2155                 addr += towrite;
2156         }
2157 out:
2158         return r;
2159 }
2160
2161
2162 static int emulator_read_emulated(unsigned long addr,
2163                                   void *val,
2164                                   unsigned int bytes,
2165                                   struct kvm_vcpu *vcpu)
2166 {
2167         struct kvm_io_device *mmio_dev;
2168         gpa_t                 gpa;
2169
2170         if (vcpu->mmio_read_completed) {
2171                 memcpy(val, vcpu->mmio_data, bytes);
2172                 vcpu->mmio_read_completed = 0;
2173                 return X86EMUL_CONTINUE;
2174         }
2175
2176         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2177
2178         /* For APIC access vmexit */
2179         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2180                 goto mmio;
2181
2182         if (kvm_read_guest_virt(addr, val, bytes, vcpu)
2183                                 == X86EMUL_CONTINUE)
2184                 return X86EMUL_CONTINUE;
2185         if (gpa == UNMAPPED_GVA)
2186                 return X86EMUL_PROPAGATE_FAULT;
2187
2188 mmio:
2189         /*
2190          * Is this MMIO handled locally?
2191          */
2192         mutex_lock(&vcpu->kvm->lock);
2193         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
2194         if (mmio_dev) {
2195                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
2196                 mutex_unlock(&vcpu->kvm->lock);
2197                 return X86EMUL_CONTINUE;
2198         }
2199         mutex_unlock(&vcpu->kvm->lock);
2200
2201         vcpu->mmio_needed = 1;
2202         vcpu->mmio_phys_addr = gpa;
2203         vcpu->mmio_size = bytes;
2204         vcpu->mmio_is_write = 0;
2205
2206         return X86EMUL_UNHANDLEABLE;
2207 }
2208
2209 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
2210                           const void *val, int bytes)
2211 {
2212         int ret;
2213
2214         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
2215         if (ret < 0)
2216                 return 0;
2217         kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
2218         return 1;
2219 }
2220
2221 static int emulator_write_emulated_onepage(unsigned long addr,
2222                                            const void *val,
2223                                            unsigned int bytes,
2224                                            struct kvm_vcpu *vcpu)
2225 {
2226         struct kvm_io_device *mmio_dev;
2227         gpa_t                 gpa;
2228
2229         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2230
2231         if (gpa == UNMAPPED_GVA) {
2232                 kvm_inject_page_fault(vcpu, addr, 2);
2233                 return X86EMUL_PROPAGATE_FAULT;
2234         }
2235
2236         /* For APIC access vmexit */
2237         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2238                 goto mmio;
2239
2240         if (emulator_write_phys(vcpu, gpa, val, bytes))
2241                 return X86EMUL_CONTINUE;
2242
2243 mmio:
2244         /*
2245          * Is this MMIO handled locally?
2246          */
2247         mutex_lock(&vcpu->kvm->lock);
2248         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
2249         if (mmio_dev) {
2250                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
2251                 mutex_unlock(&vcpu->kvm->lock);
2252                 return X86EMUL_CONTINUE;
2253         }
2254         mutex_unlock(&vcpu->kvm->lock);
2255
2256         vcpu->mmio_needed = 1;
2257         vcpu->mmio_phys_addr = gpa;
2258         vcpu->mmio_size = bytes;
2259         vcpu->mmio_is_write = 1;
2260         memcpy(vcpu->mmio_data, val, bytes);
2261
2262         return X86EMUL_CONTINUE;
2263 }
2264
2265 int emulator_write_emulated(unsigned long addr,
2266                                    const void *val,
2267                                    unsigned int bytes,
2268                                    struct kvm_vcpu *vcpu)
2269 {
2270         /* Crossing a page boundary? */
2271         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
2272                 int rc, now;
2273
2274                 now = -addr & ~PAGE_MASK;
2275                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
2276                 if (rc != X86EMUL_CONTINUE)
2277                         return rc;
2278                 addr += now;
2279                 val += now;
2280                 bytes -= now;
2281         }
2282         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
2283 }
2284 EXPORT_SYMBOL_GPL(emulator_write_emulated);
2285
2286 static int emulator_cmpxchg_emulated(unsigned long addr,
2287                                      const void *old,
2288                                      const void *new,
2289                                      unsigned int bytes,
2290                                      struct kvm_vcpu *vcpu)
2291 {
2292         static int reported;
2293
2294         if (!reported) {
2295                 reported = 1;
2296                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
2297         }
2298 #ifndef CONFIG_X86_64
2299         /* guests cmpxchg8b have to be emulated atomically */
2300         if (bytes == 8) {
2301                 gpa_t gpa;
2302                 struct page *page;
2303                 char *kaddr;
2304                 u64 val;
2305
2306                 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2307
2308                 if (gpa == UNMAPPED_GVA ||
2309                    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2310                         goto emul_write;
2311
2312                 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
2313                         goto emul_write;
2314
2315                 val = *(u64 *)new;
2316
2317                 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2318
2319                 kaddr = kmap_atomic(page, KM_USER0);
2320                 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
2321                 kunmap_atomic(kaddr, KM_USER0);
2322                 kvm_release_page_dirty(page);
2323         }
2324 emul_write:
2325 #endif
2326
2327         return emulator_write_emulated(addr, new, bytes, vcpu);
2328 }
2329
2330 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2331 {
2332         return kvm_x86_ops->get_segment_base(vcpu, seg);
2333 }
2334
2335 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2336 {
2337         kvm_mmu_invlpg(vcpu, address);
2338         return X86EMUL_CONTINUE;
2339 }
2340
2341 int emulate_clts(struct kvm_vcpu *vcpu)
2342 {
2343         KVMTRACE_0D(CLTS, vcpu, handler);
2344         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2345         return X86EMUL_CONTINUE;
2346 }
2347
2348 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2349 {
2350         struct kvm_vcpu *vcpu = ctxt->vcpu;
2351
2352         switch (dr) {
2353         case 0 ... 3:
2354                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
2355                 return X86EMUL_CONTINUE;
2356         default:
2357                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2358                 return X86EMUL_UNHANDLEABLE;
2359         }
2360 }
2361
2362 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2363 {
2364         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2365         int exception;
2366
2367         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
2368         if (exception) {
2369                 /* FIXME: better handling */
2370                 return X86EMUL_UNHANDLEABLE;
2371         }
2372         return X86EMUL_CONTINUE;
2373 }
2374
2375 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2376 {
2377         u8 opcodes[4];
2378         unsigned long rip = kvm_rip_read(vcpu);
2379         unsigned long rip_linear;
2380
2381         if (!printk_ratelimit())
2382                 return;
2383
2384         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2385
2386         kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
2387
2388         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2389                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2390 }
2391 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2392
2393 static struct x86_emulate_ops emulate_ops = {
2394         .read_std            = kvm_read_guest_virt,
2395         .read_emulated       = emulator_read_emulated,
2396         .write_emulated      = emulator_write_emulated,
2397         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
2398 };
2399
2400 static void cache_all_regs(struct kvm_vcpu *vcpu)
2401 {
2402         kvm_register_read(vcpu, VCPU_REGS_RAX);
2403         kvm_register_read(vcpu, VCPU_REGS_RSP);
2404         kvm_register_read(vcpu, VCPU_REGS_RIP);
2405         vcpu->arch.regs_dirty = ~0;
2406 }
2407
2408 int emulate_instruction(struct kvm_vcpu *vcpu,
2409                         struct kvm_run *run,
2410                         unsigned long cr2,
2411                         u16 error_code,
2412                         int emulation_type)
2413 {
2414         int r, shadow_mask;
2415         struct decode_cache *c;
2416
2417         kvm_clear_exception_queue(vcpu);
2418         vcpu->arch.mmio_fault_cr2 = cr2;
2419         /*
2420          * TODO: fix x86_emulate.c to use guest_read/write_register
2421          * instead of direct ->regs accesses, can save hundred cycles
2422          * on Intel for instructions that don't read/change RSP, for
2423          * for example.
2424          */
2425         cache_all_regs(vcpu);
2426
2427         vcpu->mmio_is_write = 0;
2428         vcpu->arch.pio.string = 0;
2429
2430         if (!(emulation_type & EMULTYPE_NO_DECODE)) {
2431                 int cs_db, cs_l;
2432                 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2433
2434                 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2435                 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
2436                 vcpu->arch.emulate_ctxt.mode =
2437                         (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2438                         ? X86EMUL_MODE_REAL : cs_l
2439                         ? X86EMUL_MODE_PROT64 : cs_db
2440                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2441
2442                 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2443
2444                 /* Reject the instructions other than VMCALL/VMMCALL when
2445                  * try to emulate invalid opcode */
2446                 c = &vcpu->arch.emulate_ctxt.decode;
2447                 if ((emulation_type & EMULTYPE_TRAP_UD) &&
2448                     (!(c->twobyte && c->b == 0x01 &&
2449                       (c->modrm_reg == 0 || c->modrm_reg == 3) &&
2450                        c->modrm_mod == 3 && c->modrm_rm == 1)))
2451                         return EMULATE_FAIL;
2452
2453                 ++vcpu->stat.insn_emulation;
2454                 if (r)  {
2455                         ++vcpu->stat.insn_emulation_fail;
2456                         if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2457                                 return EMULATE_DONE;
2458                         return EMULATE_FAIL;
2459                 }
2460         }
2461
2462         r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2463         shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
2464
2465         if (r == 0)
2466                 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
2467
2468         if (vcpu->arch.pio.string)
2469                 return EMULATE_DO_MMIO;
2470
2471         if ((r || vcpu->mmio_is_write) && run) {
2472                 run->exit_reason = KVM_EXIT_MMIO;
2473                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2474                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2475                 run->mmio.len = vcpu->mmio_size;
2476                 run->mmio.is_write = vcpu->mmio_is_write;
2477         }
2478
2479         if (r) {
2480                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2481                         return EMULATE_DONE;
2482                 if (!vcpu->mmio_needed) {
2483                         kvm_report_emulation_failure(vcpu, "mmio");
2484                         return EMULATE_FAIL;
2485                 }
2486                 return EMULATE_DO_MMIO;
2487         }
2488
2489         kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2490
2491         if (vcpu->mmio_is_write) {
2492                 vcpu->mmio_needed = 0;
2493                 return EMULATE_DO_MMIO;
2494         }
2495
2496         return EMULATE_DONE;
2497 }
2498 EXPORT_SYMBOL_GPL(emulate_instruction);
2499
2500 static int pio_copy_data(struct kvm_vcpu *vcpu)
2501 {
2502         void *p = vcpu->arch.pio_data;
2503         gva_t q = vcpu->arch.pio.guest_gva;
2504         unsigned bytes;
2505         int ret;
2506
2507         bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2508         if (vcpu->arch.pio.in)
2509                 ret = kvm_write_guest_virt(q, p, bytes, vcpu);
2510         else
2511                 ret = kvm_read_guest_virt(q, p, bytes, vcpu);
2512         return ret;
2513 }
2514
2515 int complete_pio(struct kvm_vcpu *vcpu)
2516 {
2517         struct kvm_pio_request *io = &vcpu->arch.pio;
2518         long delta;
2519         int r;
2520         unsigned long val;
2521
2522         if (!io->string) {
2523                 if (io->in) {
2524                         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2525                         memcpy(&val, vcpu->arch.pio_data, io->size);
2526                         kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2527                 }
2528         } else {
2529                 if (io->in) {
2530                         r = pio_copy_data(vcpu);
2531                         if (r)
2532                                 return r;
2533                 }
2534
2535                 delta = 1;
2536                 if (io->rep) {
2537                         delta *= io->cur_count;
2538                         /*
2539                          * The size of the register should really depend on
2540                          * current address size.
2541                          */
2542                         val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2543                         val -= delta;
2544                         kvm_register_write(vcpu, VCPU_REGS_RCX, val);
2545                 }
2546                 if (io->down)
2547                         delta = -delta;
2548                 delta *= io->size;
2549                 if (io->in) {
2550                         val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2551                         val += delta;
2552                         kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2553                 } else {
2554                         val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2555                         val += delta;
2556                         kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2557                 }
2558         }
2559
2560         io->count -= io->cur_count;
2561         io->cur_count = 0;
2562
2563         return 0;
2564 }
2565
2566 static void kernel_pio(struct kvm_io_device *pio_dev,
2567                        struct kvm_vcpu *vcpu,
2568                        void *pd)
2569 {
2570         /* TODO: String I/O for in kernel device */
2571
2572         mutex_lock(&vcpu->kvm->lock);
2573         if (vcpu->arch.pio.in)
2574                 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
2575                                   vcpu->arch.pio.size,
2576                                   pd);
2577         else
2578                 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
2579                                    vcpu->arch.pio.size,
2580                                    pd);
2581         mutex_unlock(&vcpu->kvm->lock);
2582 }
2583
2584 static void pio_string_write(struct kvm_io_device *pio_dev,
2585                              struct kvm_vcpu *vcpu)
2586 {
2587         struct kvm_pio_request *io = &vcpu->arch.pio;
2588         void *pd = vcpu->arch.pio_data;
2589         int i;
2590
2591         mutex_lock(&vcpu->kvm->lock);
2592         for (i = 0; i < io->cur_count; i++) {
2593                 kvm_iodevice_write(pio_dev, io->port,
2594                                    io->size,
2595                                    pd);
2596                 pd += io->size;
2597         }
2598         mutex_unlock(&vcpu->kvm->lock);
2599 }
2600
2601 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2602                                                gpa_t addr, int len,
2603                                                int is_write)
2604 {
2605         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
2606 }
2607
2608 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2609                   int size, unsigned port)
2610 {
2611         struct kvm_io_device *pio_dev;
2612         unsigned long val;
2613
2614         vcpu->run->exit_reason = KVM_EXIT_IO;
2615         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2616         vcpu->run->io.size = vcpu->arch.pio.size = size;
2617         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2618         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2619         vcpu->run->io.port = vcpu->arch.pio.port = port;
2620         vcpu->arch.pio.in = in;
2621         vcpu->arch.pio.string = 0;
2622         vcpu->arch.pio.down = 0;
2623         vcpu->arch.pio.rep = 0;
2624
2625         if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2626                 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2627                             handler);
2628         else
2629                 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2630                             handler);
2631
2632         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2633         memcpy(vcpu->arch.pio_data, &val, 4);
2634
2635         pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
2636         if (pio_dev) {
2637                 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2638                 complete_pio(vcpu);
2639                 return 1;
2640         }
2641         return 0;
2642 }
2643 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2644
2645 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2646                   int size, unsigned long count, int down,
2647                   gva_t address, int rep, unsigned port)
2648 {
2649         unsigned now, in_page;
2650         int ret = 0;
2651         struct kvm_io_device *pio_dev;
2652
2653         vcpu->run->exit_reason = KVM_EXIT_IO;
2654         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2655         vcpu->run->io.size = vcpu->arch.pio.size = size;
2656         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2657         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2658         vcpu->run->io.port = vcpu->arch.pio.port = port;
2659         vcpu->arch.pio.in = in;
2660         vcpu->arch.pio.string = 1;
2661         vcpu->arch.pio.down = down;
2662         vcpu->arch.pio.rep = rep;
2663
2664         if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2665                 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2666                             handler);
2667         else
2668                 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2669                             handler);
2670
2671         if (!count) {
2672                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2673                 return 1;
2674         }
2675
2676         if (!down)
2677                 in_page = PAGE_SIZE - offset_in_page(address);
2678         else
2679                 in_page = offset_in_page(address) + size;
2680         now = min(count, (unsigned long)in_page / size);
2681         if (!now)
2682                 now = 1;
2683         if (down) {
2684                 /*
2685                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
2686                  */
2687                 pr_unimpl(vcpu, "guest string pio down\n");
2688                 kvm_inject_gp(vcpu, 0);
2689                 return 1;
2690         }
2691         vcpu->run->io.count = now;
2692         vcpu->arch.pio.cur_count = now;
2693
2694         if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2695                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2696
2697         vcpu->arch.pio.guest_gva = address;
2698
2699         pio_dev = vcpu_find_pio_dev(vcpu, port,
2700                                     vcpu->arch.pio.cur_count,
2701                                     !vcpu->arch.pio.in);
2702         if (!vcpu->arch.pio.in) {
2703                 /* string PIO write */
2704                 ret = pio_copy_data(vcpu);
2705                 if (ret == X86EMUL_PROPAGATE_FAULT) {
2706                         kvm_inject_gp(vcpu, 0);
2707                         return 1;
2708                 }
2709                 if (ret == 0 && pio_dev) {
2710                         pio_string_write(pio_dev, vcpu);
2711                         complete_pio(vcpu);
2712                         if (vcpu->arch.pio.count == 0)
2713                                 ret = 1;
2714                 }
2715         } else if (pio_dev)
2716                 pr_unimpl(vcpu, "no string pio read support yet, "
2717                        "port %x size %d count %ld\n",
2718                         port, size, count);
2719
2720         return ret;
2721 }
2722 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2723
/*
 * Deliberately empty cross-CPU callback: it is passed to
 * smp_call_function_single() purely so that the target CPU takes the
 * interrupt.
 */
static void bounce_off(void *info)
{
}
2728
2729 static unsigned int  ref_freq;
2730 static unsigned long tsc_khz_ref;
2731
2732 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
2733                                      void *data)
2734 {
2735         struct cpufreq_freqs *freq = data;
2736         struct kvm *kvm;
2737         struct kvm_vcpu *vcpu;
2738         int i, send_ipi = 0;
2739
2740         if (!ref_freq)
2741                 ref_freq = freq->old;
2742
2743         if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
2744                 return 0;
2745         if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
2746                 return 0;
2747         per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
2748
2749         spin_lock(&kvm_lock);
2750         list_for_each_entry(kvm, &vm_list, vm_list) {
2751                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2752                         vcpu = kvm->vcpus[i];
2753                         if (!vcpu)
2754                                 continue;
2755                         if (vcpu->cpu != freq->cpu)
2756                                 continue;
2757                         if (!kvm_request_guest_time_update(vcpu))
2758                                 continue;
2759                         if (vcpu->cpu != smp_processor_id())
2760                                 send_ipi++;
2761                 }
2762         }
2763         spin_unlock(&kvm_lock);
2764
2765         if (freq->old < freq->new && send_ipi) {
2766                 /*
2767                  * We upscale the frequency.  Must make the guest
2768                  * doesn't see old kvmclock values while running with
2769                  * the new frequency, otherwise we risk the guest sees
2770                  * time go backwards.
2771                  *
2772                  * In case we update the frequency for another cpu
2773                  * (which might be in guest context) send an interrupt
2774                  * to kick the cpu out of guest context.  Next time
2775                  * guest context is entered kvmclock will be updated,
2776                  * so the guest will not see stale values.
2777                  */
2778                 smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
2779         }
2780         return 0;
2781 }
2782
2783 static struct notifier_block kvmclock_cpufreq_notifier_block = {
2784         .notifier_call  = kvmclock_cpufreq_notifier
2785 };
2786
2787 int kvm_arch_init(void *opaque)
2788 {
2789         int r, cpu;
2790         struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2791
2792         if (kvm_x86_ops) {
2793                 printk(KERN_ERR "kvm: already loaded the other module\n");
2794                 r = -EEXIST;
2795                 goto out;
2796         }
2797
2798         if (!ops->cpu_has_kvm_support()) {
2799                 printk(KERN_ERR "kvm: no hardware support\n");
2800                 r = -EOPNOTSUPP;
2801                 goto out;
2802         }
2803         if (ops->disabled_by_bios()) {
2804                 printk(KERN_ERR "kvm: disabled by bios\n");
2805                 r = -EOPNOTSUPP;
2806                 goto out;
2807         }
2808
2809         r = kvm_mmu_module_init();
2810         if (r)
2811                 goto out;
2812
2813         kvm_init_msr_list();
2814
2815         kvm_x86_ops = ops;
2816         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2817         kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2818         kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2819                         PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
2820
2821         for_each_possible_cpu(cpu)
2822                 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
2823         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2824                 tsc_khz_ref = tsc_khz;
2825                 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
2826                                           CPUFREQ_TRANSITION_NOTIFIER);
2827         }
2828
2829         return 0;
2830
2831 out:
2832         return r;
2833 }
2834
2835 void kvm_arch_exit(void)
2836 {
2837         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
2838                 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
2839                                             CPUFREQ_TRANSITION_NOTIFIER);
2840         kvm_x86_ops = NULL;
2841         kvm_mmu_module_exit();
2842 }
2843
2844 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2845 {
2846         ++vcpu->stat.halt_exits;
2847         KVMTRACE_0D(HLT, vcpu, handler);
2848         if (irqchip_in_kernel(vcpu->kvm)) {
2849                 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2850                 return 1;
2851         } else {
2852                 vcpu->run->exit_reason = KVM_EXIT_HLT;
2853                 return 0;
2854         }
2855 }
2856 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2857
2858 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
2859                            unsigned long a1)
2860 {
2861         if (is_long_mode(vcpu))
2862                 return a0;
2863         else
2864                 return a0 | ((gpa_t)a1 << 32);
2865 }
2866
2867 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2868 {
2869         unsigned long nr, a0, a1, a2, a3, ret;
2870         int r = 1;
2871
2872         nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
2873         a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
2874         a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
2875         a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
2876         a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
2877
2878         KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
2879
2880         if (!is_long_mode(vcpu)) {
2881                 nr &= 0xFFFFFFFF;
2882                 a0 &= 0xFFFFFFFF;
2883                 a1 &= 0xFFFFFFFF;
2884                 a2 &= 0xFFFFFFFF;
2885                 a3 &= 0xFFFFFFFF;
2886         }
2887
2888         switch (nr) {
2889         case KVM_HC_VAPIC_POLL_IRQ:
2890                 ret = 0;
2891                 break;
2892         case KVM_HC_MMU_OP:
2893                 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
2894                 break;
2895         default:
2896                 ret = -KVM_ENOSYS;
2897                 break;
2898         }
2899         kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
2900         ++vcpu->stat.hypercalls;
2901         return r;
2902 }
2903 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2904
2905 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2906 {
2907         char instruction[3];
2908         int ret = 0;
2909         unsigned long rip = kvm_rip_read(vcpu);
2910
2911
2912         /*
2913          * Blow out the MMU to ensure that no other VCPU has an active mapping
2914          * to ensure that the updated hypercall appears atomically across all
2915          * VCPUs.
2916          */
2917         kvm_mmu_zap_all(vcpu->kvm);
2918
2919         kvm_x86_ops->patch_hypercall(vcpu, instruction);
2920         if (emulator_write_emulated(rip, instruction, 3, vcpu)
2921             != X86EMUL_CONTINUE)
2922                 ret = -EFAULT;
2923
2924         return ret;
2925 }
2926
2927 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2928 {
2929         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2930 }
2931
2932 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2933 {
2934         struct descriptor_table dt = { limit, base };
2935
2936         kvm_x86_ops->set_gdt(vcpu, &dt);
2937 }
2938
2939 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2940 {
2941         struct descriptor_table dt = { limit, base };
2942
2943         kvm_x86_ops->set_idt(vcpu, &dt);
2944 }
2945
2946 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2947                    unsigned long *rflags)
2948 {
2949         kvm_lmsw(vcpu, msw);
2950         *rflags = kvm_x86_ops->get_rflags(vcpu);
2951 }
2952
2953 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2954 {
2955         unsigned long value;
2956
2957         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2958         switch (cr) {
2959         case 0:
2960                 value = vcpu->arch.cr0;
2961                 break;
2962         case 2:
2963                 value = vcpu->arch.cr2;
2964                 break;
2965         case 3:
2966                 value = vcpu->arch.cr3;
2967                 break;
2968         case 4:
2969                 value = vcpu->arch.cr4;
2970                 break;
2971         case 8:
2972                 value = kvm_get_cr8(vcpu);
2973                 break;
2974         default:
2975                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2976                 return 0;
2977         }
2978         KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
2979                     (u32)((u64)value >> 32), handler);
2980
2981         return value;
2982 }
2983
2984 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2985                      unsigned long *rflags)
2986 {
2987         KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
2988                     (u32)((u64)val >> 32), handler);
2989
2990         switch (cr) {
2991         case 0:
2992                 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2993                 *rflags = kvm_x86_ops->get_rflags(vcpu);
2994                 break;
2995         case 2:
2996                 vcpu->arch.cr2 = val;
2997                 break;
2998         case 3:
2999                 kvm_set_cr3(vcpu, val);
3000                 break;
3001         case 4:
3002                 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
3003                 break;
3004         case 8:
3005                 kvm_set_cr8(vcpu, val & 0xfUL);
3006                 break;
3007         default:
3008                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3009         }
3010 }
3011
3012 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
3013 {
3014         struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
3015         int j, nent = vcpu->arch.cpuid_nent;
3016
3017         e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
3018         /* when no next entry is found, the current entry[i] is reselected */
3019         for (j = i + 1; ; j = (j + 1) % nent) {
3020                 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
3021                 if (ej->function == e->function) {
3022                         ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
3023                         return j;
3024                 }
3025         }
3026         return 0; /* silence gcc, even though control never reaches here */
3027 }
3028
3029 /* find an entry with matching function, matching index (if needed), and that
3030  * should be read next (if it's stateful) */
3031 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
3032         u32 function, u32 index)
3033 {
3034         if (e->function != function)
3035                 return 0;
3036         if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
3037                 return 0;
3038         if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
3039             !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
3040                 return 0;
3041         return 1;
3042 }
3043
3044 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3045                                               u32 function, u32 index)
3046 {
3047         int i;
3048         struct kvm_cpuid_entry2 *best = NULL;
3049
3050         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
3051                 struct kvm_cpuid_entry2 *e;
3052
3053                 e = &vcpu->arch.cpuid_entries[i];
3054                 if (is_matching_cpuid_entry(e, function, index)) {
3055                         if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
3056                                 move_to_next_stateful_cpuid_entry(vcpu, i);
3057                         best = e;
3058                         break;
3059                 }
3060                 /*
3061                  * Both basic or both extended?
3062                  */
3063                 if (((e->function ^ function) & 0x80000000) == 0)
3064                         if (!best || e->function > best->function)
3065                                 best = e;
3066         }
3067         return best;
3068 }
3069
3070 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3071 {
3072         u32 function, index;
3073         struct kvm_cpuid_entry2 *best;
3074
3075         function = kvm_register_read(vcpu, VCPU_REGS_RAX);
3076         index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3077         kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
3078         kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
3079         kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
3080         kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
3081         best = kvm_find_cpuid_entry(vcpu, function, index);
3082         if (best) {
3083                 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
3084                 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
3085                 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
3086                 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
3087         }
3088         kvm_x86_ops->skip_emulated_instruction(vcpu);
3089         KVMTRACE_5D(CPUID, vcpu, function,
3090                     (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
3091                     (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
3092                     (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
3093                     (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
3094 }
3095 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3096
3097 /*
3098  * Check if userspace requested an interrupt window, and that the
3099  * interrupt window is open.
3100  *
3101  * No need to exit to userspace if we already have an interrupt queued.
3102  */
3103 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
3104                                           struct kvm_run *kvm_run)
3105 {
3106         return (!vcpu->arch.irq_summary &&
3107                 kvm_run->request_interrupt_window &&
3108                 vcpu->arch.interrupt_window_open &&
3109                 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
3110 }
3111
3112 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
3113                               struct kvm_run *kvm_run)
3114 {
3115         kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3116         kvm_run->cr8 = kvm_get_cr8(vcpu);
3117         kvm_run->apic_base = kvm_get_apic_base(vcpu);
3118         if (irqchip_in_kernel(vcpu->kvm))
3119                 kvm_run->ready_for_interrupt_injection = 1;
3120         else
3121                 kvm_run->ready_for_interrupt_injection =
3122                                         (vcpu->arch.interrupt_window_open &&
3123                                          vcpu->arch.irq_summary == 0);
3124 }
3125
3126 static void vapic_enter(struct kvm_vcpu *vcpu)
3127 {
3128         struct kvm_lapic *apic = vcpu->arch.apic;
3129         struct page *page;
3130
3131         if (!apic || !apic->vapic_addr)
3132                 return;
3133
3134         page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3135
3136         vcpu->arch.apic->vapic_page = page;
3137 }
3138
3139 static void vapic_exit(struct kvm_vcpu *vcpu)
3140 {
3141         struct kvm_lapic *apic = vcpu->arch.apic;
3142
3143         if (!apic || !apic->vapic_addr)
3144                 return;
3145
3146         down_read(&vcpu->kvm->slots_lock);
3147         kvm_release_page_dirty(apic->vapic_page);
3148         mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3149         up_read(&vcpu->kvm->slots_lock);
3150 }
3151
3152 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3153 {
3154         int r;
3155
3156         if (vcpu->requests)
3157                 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
3158                         kvm_mmu_unload(vcpu);
3159
3160         r = kvm_mmu_reload(vcpu);
3161         if (unlikely(r))
3162                 goto out;
3163
3164         if (vcpu->requests) {
3165                 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
3166                         __kvm_migrate_timers(vcpu);
3167                 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
3168                         kvm_write_guest_time(vcpu);
3169                 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
3170                         kvm_mmu_sync_roots(vcpu);
3171                 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
3172                         kvm_x86_ops->tlb_flush(vcpu);
3173                 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
3174                                        &vcpu->requests)) {
3175                         kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
3176                         r = 0;
3177                         goto out;
3178                 }
3179                 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
3180                         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
3181                         r = 0;
3182                         goto out;
3183                 }
3184         }
3185
3186         clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
3187         kvm_inject_pending_timer_irqs(vcpu);
3188
3189         preempt_disable();
3190
3191         kvm_x86_ops->prepare_guest_switch(vcpu);
3192         kvm_load_guest_fpu(vcpu);
3193
3194         local_irq_disable();
3195
3196         if (vcpu->requests || need_resched() || signal_pending(current)) {
3197                 local_irq_enable();
3198                 preempt_enable();
3199                 r = 1;
3200                 goto out;
3201         }
3202
3203         vcpu->guest_mode = 1;
3204         /*
3205          * Make sure that guest_mode assignment won't happen after
3206          * testing the pending IRQ vector bitmap.
3207          */
3208         smp_wmb();
3209
3210         if (vcpu->arch.exception.pending)
3211                 __queue_exception(vcpu);
3212         else if (irqchip_in_kernel(vcpu->kvm))
3213                 kvm_x86_ops->inject_pending_irq(vcpu);
3214         else
3215                 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
3216
3217         kvm_lapic_sync_to_vapic(vcpu);
3218
3219         up_read(&vcpu->kvm->slots_lock);
3220
3221         kvm_guest_enter();
3222
3223         get_debugreg(vcpu->arch.host_dr6, 6);
3224         get_debugreg(vcpu->arch.host_dr7, 7);
3225         if (unlikely(vcpu->arch.switch_db_regs)) {
3226                 get_debugreg(vcpu->arch.host_db[0], 0);
3227                 get_debugreg(vcpu->arch.host_db[1], 1);
3228                 get_debugreg(vcpu->arch.host_db[2], 2);
3229                 get_debugreg(vcpu->arch.host_db[3], 3);
3230
3231                 set_debugreg(0, 7);
3232                 set_debugreg(vcpu->arch.eff_db[0], 0);
3233                 set_debugreg(vcpu->arch.eff_db[1], 1);
3234                 set_debugreg(vcpu->arch.eff_db[2], 2);
3235                 set_debugreg(vcpu->arch.eff_db[3], 3);
3236         }
3237
3238         KVMTRACE_0D(VMENTRY, vcpu, entryexit);
3239         kvm_x86_ops->run(vcpu, kvm_run);
3240
3241         if (unlikely(vcpu->arch.switch_db_regs)) {
3242                 set_debugreg(0, 7);
3243                 set_debugreg(vcpu->arch.host_db[0], 0);
3244                 set_debugreg(vcpu->arch.host_db[1], 1);
3245                 set_debugreg(vcpu->arch.host_db[2], 2);
3246                 set_debugreg(vcpu->arch.host_db[3], 3);
3247         }
3248         set_debugreg(vcpu->arch.host_dr6, 6);
3249         set_debugreg(vcpu->arch.host_dr7, 7);
3250
3251         vcpu->guest_mode = 0;
3252         local_irq_enable();
3253
3254         ++vcpu->stat.exits;
3255
3256         /*
3257          * We must have an instruction between local_irq_enable() and
3258          * kvm_guest_exit(), so the timer interrupt isn't delayed by
3259          * the interrupt shadow.  The stat.exits increment will do nicely.
3260          * But we need to prevent reordering, hence this barrier():
3261          */
3262         barrier();
3263
3264         kvm_guest_exit();
3265
3266         preempt_enable();
3267
3268         down_read(&vcpu->kvm->slots_lock);
3269
3270         /*
3271          * Profile KVM exit RIPs:
3272          */
3273         if (unlikely(prof_on == KVM_PROFILING)) {
3274                 unsigned long rip = kvm_rip_read(vcpu);
3275                 profile_hit(KVM_PROFILING, (void *)rip);
3276         }
3277
3278         if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
3279                 vcpu->arch.exception.pending = false;
3280
3281         kvm_lapic_sync_from_vapic(vcpu);
3282
3283         r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
3284 out:
3285         return r;
3286 }
3287
3288 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3289 {
3290         int r;
3291
3292         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
3293                 pr_debug("vcpu %d received sipi with vector # %x\n",
3294                          vcpu->vcpu_id, vcpu->arch.sipi_vector);
3295                 kvm_lapic_reset(vcpu);
3296                 r = kvm_arch_vcpu_reset(vcpu);
3297                 if (r)
3298                         return r;
3299                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3300         }
3301
3302         down_read(&vcpu->kvm->slots_lock);
3303         vapic_enter(vcpu);
3304
3305         r = 1;
3306         while (r > 0) {
3307                 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3308                         r = vcpu_enter_guest(vcpu, kvm_run);
3309                 else {
3310                         up_read(&vcpu->kvm->slots_lock);
3311                         kvm_vcpu_block(vcpu);
3312                         down_read(&vcpu->kvm->slots_lock);
3313                         if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
3314                                 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
3315                                         vcpu->arch.mp_state =
3316                                                         KVM_MP_STATE_RUNNABLE;
3317                         if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
3318                                 r = -EINTR;
3319                 }
3320
3321                 if (r > 0) {
3322                         if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3323                                 r = -EINTR;
3324                                 kvm_run->exit_reason = KVM_EXIT_INTR;
3325                                 ++vcpu->stat.request_irq_exits;
3326                         }
3327                         if (signal_pending(current)) {
3328                                 r = -EINTR;
3329                                 kvm_run->exit_reason = KVM_EXIT_INTR;
3330                                 ++vcpu->stat.signal_exits;
3331                         }
3332                         if (need_resched()) {
3333                                 up_read(&vcpu->kvm->slots_lock);
3334                                 kvm_resched(vcpu);
3335                                 down_read(&vcpu->kvm->slots_lock);
3336                         }
3337                 }
3338         }
3339
3340         up_read(&vcpu->kvm->slots_lock);
3341         post_kvm_run_save(vcpu, kvm_run);
3342
3343         vapic_exit(vcpu);
3344
3345         return r;
3346 }
3347
3348 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3349 {
3350         int r;
3351         sigset_t sigsaved;
3352
3353         vcpu_load(vcpu);
3354
3355         if (vcpu->sigset_active)
3356                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
3357
3358         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
3359                 kvm_vcpu_block(vcpu);
3360                 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
3361                 r = -EAGAIN;
3362                 goto out;
3363         }
3364
3365         /* re-sync apic's tpr */
3366         if (!irqchip_in_kernel(vcpu->kvm))
3367                 kvm_set_cr8(vcpu, kvm_run->cr8);
3368
3369         if (vcpu->arch.pio.cur_count) {
3370                 r = complete_pio(vcpu);
3371                 if (r)
3372                         goto out;
3373         }
3374 #if CONFIG_HAS_IOMEM
3375         if (vcpu->mmio_needed) {
3376                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3377                 vcpu->mmio_read_completed = 1;
3378                 vcpu->mmio_needed = 0;
3379
3380                 down_read(&vcpu->kvm->slots_lock);
3381                 r = emulate_instruction(vcpu, kvm_run,
3382                                         vcpu->arch.mmio_fault_cr2, 0,
3383                                         EMULTYPE_NO_DECODE);
3384                 up_read(&vcpu->kvm->slots_lock);
3385                 if (r == EMULATE_DO_MMIO) {
3386                         /*
3387                          * Read-modify-write.  Back to userspace.
3388                          */
3389                         r = 0;
3390                         goto out;
3391                 }
3392         }
3393 #endif
3394         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3395                 kvm_register_write(vcpu, VCPU_REGS_RAX,
3396                                      kvm_run->hypercall.ret);
3397
3398         r = __vcpu_run(vcpu, kvm_run);
3399
3400 out:
3401         if (vcpu->sigset_active)
3402                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
3403
3404         vcpu_put(vcpu);
3405         return r;
3406 }
3407
3408 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3409 {
3410         vcpu_load(vcpu);
3411
3412         regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3413         regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3414         regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3415         regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3416         regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3417         regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3418         regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3419         regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3420 #ifdef CONFIG_X86_64
3421         regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
3422         regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
3423         regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
3424         regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
3425         regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
3426         regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
3427         regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
3428         regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
3429 #endif
3430
3431         regs->rip = kvm_rip_read(vcpu);
3432         regs->rflags = kvm_x86_ops->get_rflags(vcpu);
3433
3434         /*
3435          * Don't leak debug flags in case they were set for guest debugging
3436          */
3437         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3438                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3439
3440         vcpu_put(vcpu);
3441
3442         return 0;
3443 }
3444
3445 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3446 {
3447         vcpu_load(vcpu);
3448
3449         kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
3450         kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
3451         kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
3452         kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
3453         kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
3454         kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
3455         kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
3456         kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
3457 #ifdef CONFIG_X86_64
3458         kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
3459         kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
3460         kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
3461         kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
3462         kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
3463         kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3464         kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3465         kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3466
3467 #endif
3468
3469         kvm_rip_write(vcpu, regs->rip);
3470         kvm_x86_ops->set_rflags(vcpu, regs->rflags);
3471
3472
3473         vcpu->arch.exception.pending = false;
3474
3475         vcpu_put(vcpu);
3476
3477         return 0;
3478 }
3479
3480 void kvm_get_segment(struct kvm_vcpu *vcpu,
3481                      struct kvm_segment *var, int seg)
3482 {
3483         kvm_x86_ops->get_segment(vcpu, var, seg);
3484 }
3485
3486 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3487 {
3488         struct kvm_segment cs;
3489
3490         kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
3491         *db = cs.db;
3492         *l = cs.l;
3493 }
3494 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
3495
3496 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3497                                   struct kvm_sregs *sregs)
3498 {
3499         struct descriptor_table dt;
3500         int pending_vec;
3501
3502         vcpu_load(vcpu);
3503
3504         kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3505         kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3506         kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3507         kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3508         kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3509         kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3510
3511         kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3512         kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3513
3514         kvm_x86_ops->get_idt(vcpu, &dt);
3515         sregs->idt.limit = dt.limit;
3516         sregs->idt.base = dt.base;
3517         kvm_x86_ops->get_gdt(vcpu, &dt);
3518         sregs->gdt.limit = dt.limit;
3519         sregs->gdt.base = dt.base;
3520
3521         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3522         sregs->cr0 = vcpu->arch.cr0;
3523         sregs->cr2 = vcpu->arch.cr2;
3524         sregs->cr3 = vcpu->arch.cr3;
3525         sregs->cr4 = vcpu->arch.cr4;
3526         sregs->cr8 = kvm_get_cr8(vcpu);
3527         sregs->efer = vcpu->arch.shadow_efer;
3528         sregs->apic_base = kvm_get_apic_base(vcpu);
3529
3530         if (irqchip_in_kernel(vcpu->kvm)) {
3531                 memset(sregs->interrupt_bitmap, 0,
3532                        sizeof sregs->interrupt_bitmap);
3533                 pending_vec = kvm_x86_ops->get_irq(vcpu);
3534                 if (pending_vec >= 0)
3535                         set_bit(pending_vec,
3536                                 (unsigned long *)sregs->interrupt_bitmap);
3537         } else
3538                 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
3539                        sizeof sregs->interrupt_bitmap);
3540
3541         vcpu_put(vcpu);
3542
3543         return 0;
3544 }
3545
3546 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
3547                                     struct kvm_mp_state *mp_state)
3548 {
3549         vcpu_load(vcpu);
3550         mp_state->mp_state = vcpu->arch.mp_state;
3551         vcpu_put(vcpu);
3552         return 0;
3553 }
3554
3555 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3556                                     struct kvm_mp_state *mp_state)
3557 {
3558         vcpu_load(vcpu);
3559         vcpu->arch.mp_state = mp_state->mp_state;
3560         vcpu_put(vcpu);
3561         return 0;
3562 }
3563
3564 static void kvm_set_segment(struct kvm_vcpu *vcpu,
3565                         struct kvm_segment *var, int seg)
3566 {
3567         kvm_x86_ops->set_segment(vcpu, var, seg);
3568 }
3569
3570 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3571                                    struct kvm_segment *kvm_desct)
3572 {
3573         kvm_desct->base = seg_desc->base0;
3574         kvm_desct->base |= seg_desc->base1 << 16;
3575         kvm_desct->base |= seg_desc->base2 << 24;
3576         kvm_desct->limit = seg_desc->limit0;
3577         kvm_desct->limit |= seg_desc->limit << 16;
3578         if (seg_desc->g) {
3579                 kvm_desct->limit <<= 12;
3580                 kvm_desct->limit |= 0xfff;
3581         }
3582         kvm_desct->selector = selector;
3583         kvm_desct->type = seg_desc->type;
3584         kvm_desct->present = seg_desc->p;
3585         kvm_desct->dpl = seg_desc->dpl;
3586         kvm_desct->db = seg_desc->d;
3587         kvm_desct->s = seg_desc->s;
3588         kvm_desct->l = seg_desc->l;
3589         kvm_desct->g = seg_desc->g;
3590         kvm_desct->avl = seg_desc->avl;
3591         if (!selector)
3592                 kvm_desct->unusable = 1;
3593         else
3594                 kvm_desct->unusable = 0;
3595         kvm_desct->padding = 0;
3596 }
3597
3598 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
3599                                           u16 selector,
3600                                           struct descriptor_table *dtable)
3601 {
3602         if (selector & 1 << 2) {
3603                 struct kvm_segment kvm_seg;
3604
3605                 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
3606
3607                 if (kvm_seg.unusable)
3608                         dtable->limit = 0;
3609                 else
3610                         dtable->limit = kvm_seg.limit;
3611                 dtable->base = kvm_seg.base;
3612         }
3613         else
3614                 kvm_x86_ops->get_gdt(vcpu, dtable);
3615 }
3616
3617 /* allowed just for 8 bytes segments */
3618 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3619                                          struct desc_struct *seg_desc)
3620 {
3621         gpa_t gpa;
3622         struct descriptor_table dtable;
3623         u16 index = selector >> 3;
3624
3625         get_segment_descriptor_dtable(vcpu, selector, &dtable);
3626
3627         if (dtable.limit < index * 8 + 7) {
3628                 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3629                 return 1;
3630         }
3631         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3632         gpa += index * 8;
3633         return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
3634 }
3635
3636 /* allowed just for 8 bytes segments */
3637 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3638                                          struct desc_struct *seg_desc)
3639 {
3640         gpa_t gpa;
3641         struct descriptor_table dtable;
3642         u16 index = selector >> 3;
3643
3644         get_segment_descriptor_dtable(vcpu, selector, &dtable);
3645
3646         if (dtable.limit < index * 8 + 7)
3647                 return 1;
3648         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3649         gpa += index * 8;
3650         return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
3651 }
3652
3653 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3654                              struct desc_struct *seg_desc)
3655 {
3656         u32 base_addr;
3657
3658         base_addr = seg_desc->base0;
3659         base_addr |= (seg_desc->base1 << 16);
3660         base_addr |= (seg_desc->base2 << 24);
3661
3662         return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
3663 }
3664
3665 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
3666 {
3667         struct kvm_segment kvm_seg;
3668
3669         kvm_get_segment(vcpu, &kvm_seg, seg);
3670         return kvm_seg.selector;
3671 }
3672
3673 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3674                                                 u16 selector,
3675                                                 struct kvm_segment *kvm_seg)
3676 {
3677         struct desc_struct seg_desc;
3678
3679         if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
3680                 return 1;
3681         seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
3682         return 0;
3683 }
3684
3685 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
3686 {
3687         struct kvm_segment segvar = {
3688                 .base = selector << 4,
3689                 .limit = 0xffff,
3690                 .selector = selector,
3691                 .type = 3,
3692                 .present = 1,
3693                 .dpl = 3,
3694                 .db = 0,
3695                 .s = 1,
3696                 .l = 0,
3697                 .g = 0,
3698                 .avl = 0,
3699                 .unusable = 0,
3700         };
3701         kvm_x86_ops->set_segment(vcpu, &segvar, seg);
3702         return 0;
3703 }
3704
3705 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3706                                 int type_bits, int seg)
3707 {
3708         struct kvm_segment kvm_seg;
3709
3710         if (!(vcpu->arch.cr0 & X86_CR0_PE))
3711                 return kvm_load_realmode_segment(vcpu, selector, seg);
3712         if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3713                 return 1;
3714         kvm_seg.type |= type_bits;
3715
3716         if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
3717             seg != VCPU_SREG_LDTR)
3718                 if (!kvm_seg.s)
3719                         kvm_seg.unusable = 1;
3720
3721         kvm_set_segment(vcpu, &kvm_seg, seg);
3722         return 0;
3723 }
3724
3725 static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3726                                 struct tss_segment_32 *tss)
3727 {
3728         tss->cr3 = vcpu->arch.cr3;
3729         tss->eip = kvm_rip_read(vcpu);
3730         tss->eflags = kvm_x86_ops->get_rflags(vcpu);
3731         tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3732         tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3733         tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3734         tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3735         tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3736         tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3737         tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3738         tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3739         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3740         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3741         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3742         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3743         tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
3744         tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
3745         tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3746         tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3747 }
3748
3749 static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3750                                   struct tss_segment_32 *tss)
3751 {
3752         kvm_set_cr3(vcpu, tss->cr3);
3753
3754         kvm_rip_write(vcpu, tss->eip);
3755         kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
3756
3757         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
3758         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
3759         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
3760         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
3761         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
3762         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
3763         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
3764         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
3765
3766         if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3767                 return 1;
3768
3769         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3770                 return 1;
3771
3772         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3773                 return 1;
3774
3775         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3776                 return 1;
3777
3778         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3779                 return 1;
3780
3781         if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
3782                 return 1;
3783
3784         if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
3785                 return 1;
3786         return 0;
3787 }
3788
3789 static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3790                                 struct tss_segment_16 *tss)
3791 {
3792         tss->ip = kvm_rip_read(vcpu);
3793         tss->flag = kvm_x86_ops->get_rflags(vcpu);
3794         tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3795         tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3796         tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3797         tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3798         tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3799         tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3800         tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
3801         tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
3802
3803         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3804         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3805         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3806         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3807         tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3808         tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3809 }
3810
3811 static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3812                                  struct tss_segment_16 *tss)
3813 {
3814         kvm_rip_write(vcpu, tss->ip);
3815         kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
3816         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
3817         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
3818         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
3819         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
3820         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
3821         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
3822         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
3823         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
3824
3825         if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3826                 return 1;
3827
3828         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3829                 return 1;
3830
3831         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3832                 return 1;
3833
3834         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3835                 return 1;
3836
3837         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3838                 return 1;
3839         return 0;
3840 }
3841
3842 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3843                        u32 old_tss_base,
3844                        struct desc_struct *nseg_desc)
3845 {
3846         struct tss_segment_16 tss_segment_16;
3847         int ret = 0;
3848
3849         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3850                            sizeof tss_segment_16))
3851                 goto out;
3852
3853         save_state_to_tss16(vcpu, &tss_segment_16);
3854
3855         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3856                             sizeof tss_segment_16))
3857                 goto out;
3858
3859         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3860                            &tss_segment_16, sizeof tss_segment_16))
3861                 goto out;
3862
3863         if (load_state_from_tss16(vcpu, &tss_segment_16))
3864                 goto out;
3865
3866         ret = 1;
3867 out:
3868         return ret;
3869 }
3870
3871 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3872                        u32 old_tss_base,
3873                        struct desc_struct *nseg_desc)
3874 {
3875         struct tss_segment_32 tss_segment_32;
3876         int ret = 0;
3877
3878         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3879                            sizeof tss_segment_32))
3880                 goto out;
3881
3882         save_state_to_tss32(vcpu, &tss_segment_32);
3883
3884         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3885                             sizeof tss_segment_32))
3886                 goto out;
3887
3888         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3889                            &tss_segment_32, sizeof tss_segment_32))
3890                 goto out;
3891
3892         if (load_state_from_tss32(vcpu, &tss_segment_32))
3893                 goto out;
3894
3895         ret = 1;
3896 out:
3897         return ret;
3898 }
3899
3900 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3901 {
3902         struct kvm_segment tr_seg;
3903         struct desc_struct cseg_desc;
3904         struct desc_struct nseg_desc;
3905         int ret = 0;
3906         u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
3907         u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
3908
3909         old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
3910
3911         /* FIXME: Handle errors. Failure to read either TSS or their
3912          * descriptors should generate a pagefault.
3913          */
3914         if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3915                 goto out;
3916
3917         if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
3918                 goto out;
3919
3920         if (reason != TASK_SWITCH_IRET) {
3921                 int cpl;
3922
3923                 cpl = kvm_x86_ops->get_cpl(vcpu);
3924                 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
3925                         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
3926                         return 1;
3927                 }
3928         }
3929
3930         if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
3931                 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
3932                 return 1;
3933         }
3934
3935         if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
3936                 cseg_desc.type &= ~(1 << 1); //clear the B flag
3937                 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
3938         }
3939
3940         if (reason == TASK_SWITCH_IRET) {
3941                 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
3942                 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
3943         }
3944
3945         kvm_x86_ops->skip_emulated_instruction(vcpu);
3946
3947         if (nseg_desc.type & 8)
3948                 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
3949                                          &nseg_desc);
3950         else
3951                 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
3952                                          &nseg_desc);
3953
3954         if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
3955                 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
3956                 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
3957         }
3958
3959         if (reason != TASK_SWITCH_IRET) {
3960                 nseg_desc.type |= (1 << 1);
3961                 save_guest_segment_descriptor(vcpu, tss_selector,
3962                                               &nseg_desc);
3963         }
3964
3965         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
3966         seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
3967         tr_seg.type = 11;
3968         kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3969 out:
3970         return ret;
3971 }
3972 EXPORT_SYMBOL_GPL(kvm_task_switch);
3973
3974 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3975                                   struct kvm_sregs *sregs)
3976 {
3977         int mmu_reset_needed = 0;
3978         int i, pending_vec, max_bits;
3979         struct descriptor_table dt;
3980
3981         vcpu_load(vcpu);
3982
3983         dt.limit = sregs->idt.limit;
3984         dt.base = sregs->idt.base;
3985         kvm_x86_ops->set_idt(vcpu, &dt);
3986         dt.limit = sregs->gdt.limit;
3987         dt.base = sregs->gdt.base;
3988         kvm_x86_ops->set_gdt(vcpu, &dt);
3989
3990         vcpu->arch.cr2 = sregs->cr2;
3991         mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
3992
3993         down_read(&vcpu->kvm->slots_lock);
3994         if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
3995                 vcpu->arch.cr3 = sregs->cr3;
3996         else
3997                 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
3998         up_read(&vcpu->kvm->slots_lock);
3999
4000         kvm_set_cr8(vcpu, sregs->cr8);
4001
4002         mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
4003         kvm_x86_ops->set_efer(vcpu, sregs->efer);
4004         kvm_set_apic_base(vcpu, sregs->apic_base);
4005
4006         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
4007
4008         mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
4009         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
4010         vcpu->arch.cr0 = sregs->cr0;
4011
4012         mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
4013         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4014         if (!is_long_mode(vcpu) && is_pae(vcpu))
4015                 load_pdptrs(vcpu, vcpu->arch.cr3);
4016
4017         if (mmu_reset_needed)
4018                 kvm_mmu_reset_context(vcpu);
4019
4020         if (!irqchip_in_kernel(vcpu->kvm)) {
4021                 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
4022                        sizeof vcpu->arch.irq_pending);
4023                 vcpu->arch.irq_summary = 0;
4024                 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
4025                         if (vcpu->arch.irq_pending[i])
4026                                 __set_bit(i, &vcpu->arch.irq_summary);
4027         } else {
4028                 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
4029                 pending_vec = find_first_bit(
4030                         (const unsigned long *)sregs->interrupt_bitmap,
4031                         max_bits);
4032                 /* Only pending external irq is handled here */
4033                 if (pending_vec < max_bits) {
4034                         kvm_x86_ops->set_irq(vcpu, pending_vec);
4035                         pr_debug("Set back pending irq %d\n",
4036                                  pending_vec);
4037                 }
4038                 kvm_pic_clear_isr_ack(vcpu->kvm);
4039         }
4040
4041         kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4042         kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4043         kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
4044         kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
4045         kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
4046         kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
4047
4048         kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
4049         kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4050
4051         /* Older userspace won't unhalt the vcpu on reset. */
4052         if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
4053             sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4054             !(vcpu->arch.cr0 & X86_CR0_PE))
4055                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4056
4057         vcpu_put(vcpu);
4058
4059         return 0;
4060 }
4061
4062 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4063                                         struct kvm_guest_debug *dbg)
4064 {
4065         int i, r;
4066
4067         vcpu_load(vcpu);
4068
4069         if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) ==
4070             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) {
4071                 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4072                         vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4073                 vcpu->arch.switch_db_regs =
4074                         (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
4075         } else {
4076                 for (i = 0; i < KVM_NR_DB_REGS; i++)
4077                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
4078                 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4079         }
4080
4081         r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
4082
4083         if (dbg->control & KVM_GUESTDBG_INJECT_DB)
4084                 kvm_queue_exception(vcpu, DB_VECTOR);
4085         else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
4086                 kvm_queue_exception(vcpu, BP_VECTOR);
4087
4088         vcpu_put(vcpu);
4089
4090         return r;
4091 }
4092
4093 /*
4094  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
4095  * we have asm/x86/processor.h
4096  */
4097 struct fxsave {
4098         u16     cwd;
4099         u16     swd;
4100         u16     twd;
4101         u16     fop;
4102         u64     rip;
4103         u64     rdp;
4104         u32     mxcsr;
4105         u32     mxcsr_mask;
4106         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
4107 #ifdef CONFIG_X86_64
4108         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
4109 #else
4110         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
4111 #endif
4112 };
4113
4114 /*
4115  * Translate a guest virtual address to a guest physical address.
4116  */
4117 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
4118                                     struct kvm_translation *tr)
4119 {
4120         unsigned long vaddr = tr->linear_address;
4121         gpa_t gpa;
4122
4123         vcpu_load(vcpu);
4124         down_read(&vcpu->kvm->slots_lock);
4125         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
4126         up_read(&vcpu->kvm->slots_lock);
4127         tr->physical_address = gpa;
4128         tr->valid = gpa != UNMAPPED_GVA;
4129         tr->writeable = 1;
4130         tr->usermode = 0;
4131         vcpu_put(vcpu);
4132
4133         return 0;
4134 }
4135
4136 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4137 {
4138         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4139
4140         vcpu_load(vcpu);
4141
4142         memcpy(fpu->fpr, fxsave->st_space, 128);
4143         fpu->fcw = fxsave->cwd;
4144         fpu->fsw = fxsave->swd;
4145         fpu->ftwx = fxsave->twd;
4146         fpu->last_opcode = fxsave->fop;
4147         fpu->last_ip = fxsave->rip;
4148         fpu->last_dp = fxsave->rdp;
4149         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
4150
4151         vcpu_put(vcpu);
4152
4153         return 0;
4154 }
4155
4156 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4157 {
4158         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4159
4160         vcpu_load(vcpu);
4161
4162         memcpy(fxsave->st_space, fpu->fpr, 128);
4163         fxsave->cwd = fpu->fcw;
4164         fxsave->swd = fpu->fsw;
4165         fxsave->twd = fpu->ftwx;
4166         fxsave->fop = fpu->last_opcode;
4167         fxsave->rip = fpu->last_ip;
4168         fxsave->rdp = fpu->last_dp;
4169         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
4170
4171         vcpu_put(vcpu);
4172
4173         return 0;
4174 }
4175
4176 void fx_init(struct kvm_vcpu *vcpu)
4177 {
4178         unsigned after_mxcsr_mask;
4179
4180         /*
4181          * Touch the fpu the first time in non atomic context as if
4182          * this is the first fpu instruction the exception handler
4183          * will fire before the instruction returns and it'll have to
4184          * allocate ram with GFP_KERNEL.
4185          */
4186         if (!used_math())
4187                 kvm_fx_save(&vcpu->arch.host_fx_image);
4188
4189         /* Initialize guest FPU by resetting ours and saving into guest's */
4190         preempt_disable();
4191         kvm_fx_save(&vcpu->arch.host_fx_image);
4192         kvm_fx_finit();
4193         kvm_fx_save(&vcpu->arch.guest_fx_image);
4194         kvm_fx_restore(&vcpu->arch.host_fx_image);
4195         preempt_enable();
4196
4197         vcpu->arch.cr0 |= X86_CR0_ET;
4198         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
4199         vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
4200         memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
4201                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
4202 }
4203 EXPORT_SYMBOL_GPL(fx_init);
4204
4205 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
4206 {
4207         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
4208                 return;
4209
4210         vcpu->guest_fpu_loaded = 1;
4211         kvm_fx_save(&vcpu->arch.host_fx_image);
4212         kvm_fx_restore(&vcpu->arch.guest_fx_image);
4213 }
4214 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4215
4216 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4217 {
4218         if (!vcpu->guest_fpu_loaded)
4219                 return;
4220
4221         vcpu->guest_fpu_loaded = 0;
4222         kvm_fx_save(&vcpu->arch.guest_fx_image);
4223         kvm_fx_restore(&vcpu->arch.host_fx_image);
4224         ++vcpu->stat.fpu_reload;
4225 }
4226 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4227
4228 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
4229 {
4230         if (vcpu->arch.time_page) {
4231                 kvm_release_page_dirty(vcpu->arch.time_page);
4232                 vcpu->arch.time_page = NULL;
4233         }
4234
4235         kvm_x86_ops->vcpu_free(vcpu);
4236 }
4237
4238 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
4239                                                 unsigned int id)
4240 {
4241         return kvm_x86_ops->vcpu_create(kvm, id);
4242 }
4243
4244 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
4245 {
4246         int r;
4247
4248         /* We do fxsave: this must be aligned. */
4249         BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
4250
4251         vcpu->arch.mtrr_state.have_fixed = 1;
4252         vcpu_load(vcpu);
4253         r = kvm_arch_vcpu_reset(vcpu);
4254         if (r == 0)
4255                 r = kvm_mmu_setup(vcpu);
4256         vcpu_put(vcpu);
4257         if (r < 0)
4258                 goto free_vcpu;
4259
4260         return 0;
4261 free_vcpu:
4262         kvm_x86_ops->vcpu_free(vcpu);
4263         return r;
4264 }
4265
4266 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
4267 {
4268         vcpu_load(vcpu);
4269         kvm_mmu_unload(vcpu);
4270         vcpu_put(vcpu);
4271
4272         kvm_x86_ops->vcpu_free(vcpu);
4273 }
4274
4275 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4276 {
4277         vcpu->arch.nmi_pending = false;
4278         vcpu->arch.nmi_injected = false;
4279
4280         vcpu->arch.switch_db_regs = 0;
4281         memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
4282         vcpu->arch.dr6 = DR6_FIXED_1;
4283         vcpu->arch.dr7 = DR7_FIXED_1;
4284
4285         return kvm_x86_ops->vcpu_reset(vcpu);
4286 }
4287
4288 void kvm_arch_hardware_enable(void *garbage)
4289 {
4290         kvm_x86_ops->hardware_enable(garbage);
4291 }
4292
4293 void kvm_arch_hardware_disable(void *garbage)
4294 {
4295         kvm_x86_ops->hardware_disable(garbage);
4296 }
4297
4298 int kvm_arch_hardware_setup(void)
4299 {
4300         return kvm_x86_ops->hardware_setup();
4301 }
4302
4303 void kvm_arch_hardware_unsetup(void)
4304 {
4305         kvm_x86_ops->hardware_unsetup();
4306 }
4307
4308 void kvm_arch_check_processor_compat(void *rtn)
4309 {
4310         kvm_x86_ops->check_processor_compatibility(rtn);
4311 }
4312
4313 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4314 {
4315         struct page *page;
4316         struct kvm *kvm;
4317         int r;
4318
4319         BUG_ON(vcpu->kvm == NULL);
4320         kvm = vcpu->kvm;
4321
4322         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4323         if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
4324                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4325         else
4326                 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
4327
4328         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
4329         if (!page) {
4330                 r = -ENOMEM;
4331                 goto fail;
4332         }
4333         vcpu->arch.pio_data = page_address(page);
4334
4335         r = kvm_mmu_create(vcpu);
4336         if (r < 0)
4337                 goto fail_free_pio_data;
4338
4339         if (irqchip_in_kernel(kvm)) {
4340                 r = kvm_create_lapic(vcpu);
4341                 if (r < 0)
4342                         goto fail_mmu_destroy;
4343         }
4344
4345         return 0;
4346
4347 fail_mmu_destroy:
4348         kvm_mmu_destroy(vcpu);
4349 fail_free_pio_data:
4350         free_page((unsigned long)vcpu->arch.pio_data);
4351 fail:
4352         return r;
4353 }
4354
4355 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
4356 {
4357         kvm_free_lapic(vcpu);
4358         down_read(&vcpu->kvm->slots_lock);
4359         kvm_mmu_destroy(vcpu);
4360         up_read(&vcpu->kvm->slots_lock);
4361         free_page((unsigned long)vcpu->arch.pio_data);
4362 }
4363
4364 struct  kvm *kvm_arch_create_vm(void)
4365 {
4366         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
4367
4368         if (!kvm)
4369                 return ERR_PTR(-ENOMEM);
4370
4371         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4372         INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
4373         INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4374
4375         /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
4376         set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
4377
4378         rdtscll(kvm->arch.vm_init_tsc);
4379
4380         return kvm;
4381 }
4382
/* Unload @vcpu's MMU, bracketed by vcpu_load()/vcpu_put(). */
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);
}
4389
4390 static void kvm_free_vcpus(struct kvm *kvm)
4391 {
4392         unsigned int i;
4393
4394         /*
4395          * Unpin any mmu pages first.
4396          */
4397         for (i = 0; i < KVM_MAX_VCPUS; ++i)
4398                 if (kvm->vcpus[i])
4399                         kvm_unload_vcpu_mmu(kvm->vcpus[i]);
4400         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
4401                 if (kvm->vcpus[i]) {
4402                         kvm_arch_vcpu_free(kvm->vcpus[i]);
4403                         kvm->vcpus[i] = NULL;
4404                 }
4405         }
4406
4407 }
4408
/* Free all devices assigned to @kvm. */
void kvm_arch_sync_events(struct kvm *kvm)
{
	kvm_free_all_assigned_devices(kvm);
}
4413
4414 void kvm_arch_destroy_vm(struct kvm *kvm)
4415 {
4416         kvm_iommu_unmap_guest(kvm);
4417         kvm_free_pit(kvm);
4418         kfree(kvm->arch.vpic);
4419         kfree(kvm->arch.vioapic);
4420         kvm_free_vcpus(kvm);
4421         kvm_free_physmem(kvm);
4422         if (kvm->arch.apic_access_page)
4423                 put_page(kvm->arch.apic_access_page);
4424         if (kvm->arch.ept_identity_pagetable)
4425                 put_page(kvm->arch.ept_identity_pagetable);
4426         kfree(kvm);
4427 }
4428
4429 int kvm_arch_set_memory_region(struct kvm *kvm,
4430                                 struct kvm_userspace_memory_region *mem,
4431                                 struct kvm_memory_slot old,
4432                                 int user_alloc)
4433 {
4434         int npages = mem->memory_size >> PAGE_SHIFT;
4435         struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
4436
4437         /*To keep backward compatibility with older userspace,
4438          *x86 needs to hanlde !user_alloc case.
4439          */
4440         if (!user_alloc) {
4441                 if (npages && !old.rmap) {
4442                         unsigned long userspace_addr;
4443
4444                         down_write(&current->mm->mmap_sem);
4445                         userspace_addr = do_mmap(NULL, 0,
4446                                                  npages * PAGE_SIZE,
4447                                                  PROT_READ | PROT_WRITE,
4448                                                  MAP_PRIVATE | MAP_ANONYMOUS,
4449                                                  0);
4450                         up_write(&current->mm->mmap_sem);
4451
4452                         if (IS_ERR((void *)userspace_addr))
4453                                 return PTR_ERR((void *)userspace_addr);
4454
4455                         /* set userspace_addr atomically for kvm_hva_to_rmapp */
4456                         spin_lock(&kvm->mmu_lock);
4457                         memslot->userspace_addr = userspace_addr;
4458                         spin_unlock(&kvm->mmu_lock);
4459                 } else {
4460                         if (!old.user_alloc && old.rmap) {
4461                                 int ret;
4462
4463                                 down_write(&current->mm->mmap_sem);
4464                                 ret = do_munmap(current->mm, old.userspace_addr,
4465                                                 old.npages * PAGE_SIZE);
4466                                 up_write(&current->mm->mmap_sem);
4467                                 if (ret < 0)
4468                                         printk(KERN_WARNING
4469                                        "kvm_vm_ioctl_set_memory_region: "
4470                                        "failed to munmap memory\n");
4471                         }
4472                 }
4473         }
4474
4475         spin_lock(&kvm->mmu_lock);
4476         if (!kvm->arch.n_requested_mmu_pages) {
4477                 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
4478                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
4479         }
4480
4481         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4482         spin_unlock(&kvm->mmu_lock);
4483         kvm_flush_remote_tlbs(kvm);
4484
4485         return 0;
4486 }
4487
/* Zap all MMU pages of @kvm, then make every vcpu reload its MMU. */
void kvm_arch_flush_shadow(struct kvm *kvm)
{
	kvm_mmu_zap_all(kvm);
	kvm_reload_remote_mmus(kvm);
}
4493
4494 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4495 {
4496         return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
4497                || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
4498                || vcpu->arch.nmi_pending;
4499 }
4500
/*
 * IPI callback used by kvm_vcpu_kick().  It has no work to do of its own;
 * with DEBUG defined it logs the vcpu pointer passed in @info.
 */
static void vcpu_kick_intr(void *info)
{
#ifdef DEBUG
	struct kvm_vcpu *vcpu = info;

	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
#endif
}
4508
4509 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4510 {
4511         int ipi_pcpu = vcpu->cpu;
4512         int cpu = get_cpu();
4513
4514         if (waitqueue_active(&vcpu->wq)) {
4515                 wake_up_interruptible(&vcpu->wq);
4516                 ++vcpu->stat.halt_wakeup;
4517         }
4518         /*
4519          * We may be called synchronously with irqs disabled in guest mode,
4520          * So need not to call smp_call_function_single() in that case.
4521          */
4522         if (vcpu->guest_mode && vcpu->cpu != cpu)
4523                 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
4524         put_cpu();
4525 }