KVM: MMU: do not free active mmu pages in free_mmu_pages()
virt/kvm/kvm_main.c
blob 53ab86d46698c1fd316bd96f0cbbbf7fb8a11e20
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */
#include "iodev.h"

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>

#include <asm/processor.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>

#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
#include "coalesced_mmio.h"
#endif
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

DEFINE_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);

static cpumask_t cpus_hardware_enabled;

struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

struct dentry *kvm_debugfs_dir;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);

bool kvm_rebooting;

static inline int valid_vcpu(int n)
{
	return likely(n >= 0 && n < KVM_MAX_VCPUS);
}
/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu;

	mutex_lock(&vcpu->mutex);
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
	mutex_unlock(&vcpu->mutex);
}

static void ack_flush(void *_completed)
{
}
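/*
 * Remote TLB shootdown: instead of flushing other CPUs directly, each
 * target vcpu gets KVM_REQ_TLB_FLUSH (or KVM_REQ_MMU_RELOAD below) set in
 * vcpu->requests, and an IPI sent via smp_call_function_mask() with the
 * empty ack_flush() handler above forces it out of guest mode, so it
 * processes the request before the next guest entry.  Vcpus whose request
 * bit was already set are skipped.
 */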
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	int i, cpu, me;
	cpumask_t cpus;
	struct kvm_vcpu *vcpu;

	me = get_cpu();
	cpus_clear(cpus);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		vcpu = kvm->vcpus[i];
		if (!vcpu)
			continue;
		if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
			continue;
		cpu = vcpu->cpu;
		if (cpu != -1 && cpu != me)
			cpu_set(cpu, cpus);
	}
	if (cpus_empty(cpus))
		goto out;
	++kvm->stat.remote_tlb_flush;
	smp_call_function_mask(cpus, ack_flush, NULL, 1);
out:
	put_cpu();
}

void kvm_reload_remote_mmus(struct kvm *kvm)
{
	int i, cpu, me;
	cpumask_t cpus;
	struct kvm_vcpu *vcpu;

	me = get_cpu();
	cpus_clear(cpus);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		vcpu = kvm->vcpus[i];
		if (!vcpu)
			continue;
		if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
			continue;
		cpu = vcpu->cpu;
		if (cpu != -1 && cpu != me)
			cpu_set(cpu, cpus);
	}
	if (cpus_empty(cpus))
		goto out;
	smp_call_function_mask(cpus, ack_flush, NULL, 1);
out:
	put_cpu();
}
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	struct page *page;
	int r;

	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	init_waitqueue_head(&vcpu->wq);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->run = page_address(page);

	r = kvm_arch_vcpu_init(vcpu);
	if (r < 0)
		goto fail_free_run;
	return 0;

fail_free_run:
	free_page((unsigned long)vcpu->run);
fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);

void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_uninit(vcpu);
	free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
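/*
 * MMU notifier integration (when CONFIG_MMU_NOTIFIER and the arch opt in):
 * mmu_notifier_seq and mmu_notifier_count are updated under mmu_lock so
 * the KVM page fault path can detect that a host page it is about to map
 * may have been invalidated concurrently, and retry instead of installing
 * a stale spte.  See the comments in the handlers below for the exact
 * ordering requirements.
 */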
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
					     struct mm_struct *mm,
					     unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush;

	/*
	 * When ->invalidate_page runs, the linux pte has been zapped
	 * already but the page is still allocated until
	 * ->invalidate_page returns.  So if we increase the sequence
	 * here the kvm page fault will notice if the spte can't be
	 * established because the page is going to be freed.  If
	 * instead the kvm page fault establishes the spte before
	 * ->invalidate_page runs, kvm_unmap_hva will release it
	 * before returning.
	 *
	 * The sequence increase only needs to be seen at spin_unlock
	 * time, and not at spin_lock time.
	 *
	 * Increasing the sequence after the spin_unlock would be
	 * unsafe because the kvm page fault could then establish the
	 * pte after kvm_unmap_hva returned, without noticing the page
	 * is going to be freed.
	 */
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	need_tlb_flush = kvm_unmap_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);

	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);
}

static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush = 0;

	spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	for (; start < end; start += PAGE_SIZE)
		need_tlb_flush |= kvm_unmap_hva(kvm, start);
	spin_unlock(&kvm->mmu_lock);

	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, but both values are read by the kvm
	 * page fault under the mmu_lock spinlock, so we don't need to
	 * add an smp_wmb() here in between the two.
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young;

	spin_lock(&kvm->mmu_lock);
	young = kvm_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);

	if (young)
		kvm_flush_remote_tlbs(kvm);

	return young;
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
};
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
static struct kvm *kvm_create_vm(void)
{
	struct kvm *kvm = kvm_arch_create_vm();
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	struct page *page;
#endif

	if (IS_ERR(kvm))
		goto out;

#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		kfree(kvm);
		return ERR_PTR(-ENOMEM);
	}
	kvm->coalesced_mmio_ring =
			(struct kvm_coalesced_mmio_ring *)page_address(page);
#endif

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	{
		int err;
		kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
		err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
		if (err) {
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
			put_page(page);
#endif
			kfree(kvm);
			return ERR_PTR(err);
		}
	}
#endif

	kvm->mm = current->mm;
	atomic_inc(&kvm->mm->mm_count);
	spin_lock_init(&kvm->mmu_lock);
	kvm_io_bus_init(&kvm->pio_bus);
	mutex_init(&kvm->lock);
	kvm_io_bus_init(&kvm->mmio_bus);
	init_rwsem(&kvm->slots_lock);
	atomic_set(&kvm->users_count, 1);
	spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	spin_unlock(&kvm_lock);
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	kvm_coalesced_mmio_init(kvm);
#endif
out:
	return kvm;
}
/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
{
	if (!dont || free->rmap != dont->rmap)
		vfree(free->rmap);

	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		vfree(free->dirty_bitmap);

	if (!dont || free->lpage_info != dont->lpage_info)
		vfree(free->lpage_info);

	free->npages = 0;
	free->dirty_bitmap = NULL;
	free->rmap = NULL;
	free->lpage_info = NULL;
}

void kvm_free_physmem(struct kvm *kvm)
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i)
		kvm_free_physmem_slot(&kvm->memslots[i], NULL);
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	struct mm_struct *mm = kvm->mm;

	spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	spin_unlock(&kvm_lock);
	kvm_io_bus_destroy(&kvm->pio_bus);
	kvm_io_bus_destroy(&kvm->mmio_bus);
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	if (kvm->coalesced_mmio_ring != NULL)
		free_page((unsigned long)kvm->coalesced_mmio_ring);
#endif
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
	kvm_arch_flush_shadow(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	mmdrop(mm);
}

void kvm_get_kvm(struct kvm *kvm)
{
	atomic_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

void kvm_put_kvm(struct kvm *kvm)
{
	if (atomic_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_put_kvm(kvm);
	return 0;
}
/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding mmap_sem for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    struct kvm_userspace_memory_region *mem,
			    int user_alloc)
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long i;
	struct kvm_memory_slot *memslot;
	struct kvm_memory_slot old, new;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	memslot = &kvm->memslots[mem->slot];
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

	new = old = *memslot;

	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;

	/* Disallow changing a memory slot's size. */
	r = -EINVAL;
	if (npages && old.npages && npages != old.npages)
		goto out_free;

	/* Check for overlaps */
	r = -EEXIST;
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots[i];

		if (s == memslot)
			continue;
		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))
			goto out_free;
	}

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;

	r = -ENOMEM;

	/* Allocate if a slot is being created */
#ifndef CONFIG_S390
	if (npages && !new.rmap) {
		new.rmap = vmalloc(npages * sizeof(struct page *));

		if (!new.rmap)
			goto out_free;

		memset(new.rmap, 0, npages * sizeof(*new.rmap));

		new.user_alloc = user_alloc;
		/*
		 * hva_to_rmmap() serializes with the mmu_lock and to be
		 * safe it has to ignore memslots with !user_alloc &&
		 * !userspace_addr.
		 */
		if (user_alloc)
			new.userspace_addr = mem->userspace_addr;
		else
			new.userspace_addr = 0;
	}
	if (npages && !new.lpage_info) {
		int largepages = npages / KVM_PAGES_PER_HPAGE;
		if (npages % KVM_PAGES_PER_HPAGE)
			largepages++;
		if (base_gfn % KVM_PAGES_PER_HPAGE)
			largepages++;

		new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info));

		if (!new.lpage_info)
			goto out_free;

		memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info));

		if (base_gfn % KVM_PAGES_PER_HPAGE)
			new.lpage_info[0].write_count = 1;
		if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE)
			new.lpage_info[largepages-1].write_count = 1;
	}

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;

		new.dirty_bitmap = vmalloc(dirty_bytes);
		if (!new.dirty_bitmap)
			goto out_free;
		memset(new.dirty_bitmap, 0, dirty_bytes);
	}
#endif /* not defined CONFIG_S390 */

	if (!npages)
		kvm_arch_flush_shadow(kvm);

	spin_lock(&kvm->mmu_lock);
	if (mem->slot >= kvm->nmemslots)
		kvm->nmemslots = mem->slot + 1;

	*memslot = new;
	spin_unlock(&kvm->mmu_lock);

	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
	if (r) {
		spin_lock(&kvm->mmu_lock);
		*memslot = old;
		spin_unlock(&kvm->mmu_lock);
		goto out_free;
	}

	kvm_free_physmem_slot(&old, &new);
	return 0;

out_free:
	kvm_free_physmem_slot(&new, &old);
out:
	return r;
}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  struct kvm_userspace_memory_region *mem,
			  int user_alloc)
{
	int r;

	down_write(&kvm->slots_lock);
	r = __kvm_set_memory_region(kvm, mem, user_alloc);
	up_write(&kvm->slots_lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				   struct
				   kvm_userspace_memory_region *mem,
				   int user_alloc)
{
	if (mem->slot >= KVM_MEMORY_SLOTS)
		return -EINVAL;
	return kvm_set_memory_region(kvm, mem, user_alloc);
}
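/*
 * Illustrative userspace usage of the ioctl handled above (sketch, not
 * part of this file): guest memory is backed by an ordinary host mapping
 * and registered as a slot, e.g.
 *
 *	void *mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	struct kvm_userspace_memory_region region = {
 *		.slot            = 0,
 *		.flags           = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size     = mem_size,
 *		.userspace_addr  = (__u64)(unsigned long)mem,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 *
 * which reaches kvm_vm_ioctl_set_memory_region() with user_alloc == 1.
 */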
int kvm_get_dirty_log(struct kvm *kvm,
			struct kvm_dirty_log *log, int *is_dirty)
{
	struct kvm_memory_slot *memslot;
	int r, i;
	int n;
	unsigned long any = 0;

	r = -EINVAL;
	if (log->slot >= KVM_MEMORY_SLOTS)
		goto out;

	memslot = &kvm->memslots[log->slot];
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;

	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;

	for (i = 0; !any && i < n/sizeof(long); ++i)
		any = memslot->dirty_bitmap[i];

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
		goto out;

	if (any)
		*is_dirty = 1;

	r = 0;
out:
	return r;
}

int is_error_page(struct page *page)
{
	return page == bad_page;
}
EXPORT_SYMBOL_GPL(is_error_page);

int is_error_pfn(pfn_t pfn)
{
	return pfn == bad_pfn;
}
EXPORT_SYMBOL_GPL(is_error_pfn);

static inline unsigned long bad_hva(void)
{
	return PAGE_OFFSET;
}

int kvm_is_error_hva(unsigned long addr)
{
	return addr == bad_hva();
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);
static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return memslot;
	}
	return NULL;
}

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	gfn = unalias_gfn(kvm, gfn);
	return __gfn_to_memslot(kvm, gfn);
}

int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	int i;

	gfn = unalias_gfn(kvm, gfn);
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;

	gfn = unalias_gfn(kvm, gfn);
	slot = __gfn_to_memslot(kvm, gfn);
	if (!slot)
		return bad_hva();
	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
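/*
 * Translation chain used by the functions below: a guest frame number is
 * first mapped to a host virtual address via its memslot (gfn_to_hva),
 * and that address is then pinned and converted to a host page frame
 * number with get_user_pages() (gfn_to_pfn).  VM_PFNMAP ranges (e.g.
 * device memory mapped into the VMA) have no struct page, so the pfn is
 * computed directly from the VMA layout instead.
 */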
/*
 * Requires current->mm->mmap_sem to be held
 */
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
	struct page *page[1];
	unsigned long addr;
	int npages;
	pfn_t pfn;

	might_sleep();

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr)) {
		get_page(bad_page);
		return page_to_pfn(bad_page);
	}

	npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
				NULL);

	if (unlikely(npages != 1)) {
		struct vm_area_struct *vma;

		vma = find_vma(current->mm, addr);
		if (vma == NULL || addr < vma->vm_start ||
		    !(vma->vm_flags & VM_PFNMAP)) {
			get_page(bad_page);
			return page_to_pfn(bad_page);
		}

		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		BUG_ON(pfn_valid(pfn));
	} else
		pfn = page_to_pfn(page[0]);

	return pfn;
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);

struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
	pfn_t pfn;

	pfn = gfn_to_pfn(kvm, gfn);
	if (pfn_valid(pfn))
		return pfn_to_page(pfn);

	WARN_ON(!pfn_valid(pfn));

	get_page(bad_page);
	return bad_page;
}
EXPORT_SYMBOL_GPL(gfn_to_page);
void kvm_release_page_clean(struct page *page)
{
	kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

void kvm_release_pfn_clean(pfn_t pfn)
{
	if (pfn_valid(pfn))
		put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);

void kvm_release_page_dirty(struct page *page)
{
	kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

void kvm_release_pfn_dirty(pfn_t pfn)
{
	kvm_set_pfn_dirty(pfn);
	kvm_release_pfn_clean(pfn);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);

void kvm_set_page_dirty(struct page *page)
{
	kvm_set_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_set_page_dirty);

void kvm_set_pfn_dirty(pfn_t pfn)
{
	if (pfn_valid(pfn)) {
		struct page *page = pfn_to_page(pfn);
		if (!PageReserved(page))
			SetPageDirty(page);
	}
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);

void kvm_set_pfn_accessed(pfn_t pfn)
{
	if (pfn_valid(pfn))
		mark_page_accessed(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);

void kvm_get_pfn(pfn_t pfn)
{
	if (pfn_valid(pfn))
		get_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_get_pfn);

static int next_segment(unsigned long len, int offset)
{
	if (len > PAGE_SIZE - offset)
		return PAGE_SIZE - offset;
	else
		return len;
}
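/*
 * The guest access helpers below use next_segment() to split a request
 * into page-sized chunks.  For example, with PAGE_SIZE == 4096, reading
 * 0x300 bytes at gpa 0x1f00 is done in two steps: 0x100 bytes from the
 * end of the first page (offset 0xf00), then 0x200 bytes from offset 0
 * of the following page.
 */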
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
			int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = copy_from_user(data, (void __user *)addr + offset, len);
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);

int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);

int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
			  unsigned long len)
{
	int r;
	unsigned long addr;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	pagefault_disable();
	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
	pagefault_enable();
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL(kvm_read_guest_atomic);

int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
			 int offset, int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = copy_to_user((void __user *)addr + offset, data, len);
	if (r)
		return -EFAULT;
	mark_page_dirty(kvm, gfn);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);

int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
		    unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}

int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
	return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);

int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot;

	gfn = unalias_gfn(kvm, gfn);
	memslot = __gfn_to_memslot(kvm, gfn);
	if (memslot && memslot->dirty_bitmap) {
		unsigned long rel_gfn = gfn - memslot->base_gfn;

		/* avoid RMW */
		if (!test_bit(rel_gfn, memslot->dirty_bitmap))
			set_bit(rel_gfn, memslot->dirty_bitmap);
	}
}
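/*
 * kvm_vcpu_block() below implements in-kernel HLT: the vcpu sleeps on its
 * wait queue until an interrupt, a pending timer, an arch-specific
 * runnable condition, or a signal makes it runnable again.  The vcpu is
 * unloaded around schedule() so its mutex and per-cpu state are not held
 * while sleeping.
 */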
/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

		if (kvm_cpu_has_interrupt(vcpu))
			break;
		if (kvm_cpu_has_pending_timer(vcpu))
			break;
		if (kvm_arch_vcpu_runnable(vcpu))
			break;
		if (signal_pending(current))
			break;

		vcpu_put(vcpu);
		schedule();
		vcpu_load(vcpu);
	}

	finish_wait(&vcpu->wq, &wait);
}

void kvm_resched(struct kvm_vcpu *vcpu)
{
	if (!need_resched())
		return;
	cond_resched();
}
EXPORT_SYMBOL_GPL(kvm_resched);
static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
	struct page *page;

	if (vmf->pgoff == 0)
		page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->arch.pio_data);
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
	else
		return VM_FAULT_SIGBUS;
	get_page(page);
	vmf->page = page;
	return 0;
}

static struct vm_operations_struct kvm_vcpu_vm_ops = {
	.fault = kvm_vcpu_fault,
};

static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vcpu_vm_ops;
	return 0;
}

static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
	struct kvm_vcpu *vcpu = filp->private_data;

	kvm_put_kvm(vcpu->kvm);
	return 0;
}

static const struct file_operations kvm_vcpu_fops = {
	.release        = kvm_vcpu_release,
	.unlocked_ioctl = kvm_vcpu_ioctl,
	.compat_ioctl   = kvm_vcpu_ioctl,
	.mmap           = kvm_vcpu_mmap,
};

/*
 * Allocates an inode for the vcpu.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
	int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0);
	if (fd < 0)
		kvm_put_kvm(vcpu->kvm);
	return fd;
}
/*
 * Creates some virtual cpus.  Good luck creating more than one.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
{
	int r;
	struct kvm_vcpu *vcpu;

	if (!valid_vcpu(n))
		return -EINVAL;

	vcpu = kvm_arch_vcpu_create(kvm, n);
	if (IS_ERR(vcpu))
		return PTR_ERR(vcpu);

	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);

	r = kvm_arch_vcpu_setup(vcpu);
	if (r)
		return r;

	mutex_lock(&kvm->lock);
	if (kvm->vcpus[n]) {
		r = -EEXIST;
		goto vcpu_destroy;
	}
	kvm->vcpus[n] = vcpu;
	mutex_unlock(&kvm->lock);

	/* Now it's all set up, let userspace reach it */
	kvm_get_kvm(kvm);
	r = create_vcpu_fd(vcpu);
	if (r < 0)
		goto unlink;
	return r;

unlink:
	mutex_lock(&kvm->lock);
	kvm->vcpus[n] = NULL;
vcpu_destroy:
	mutex_unlock(&kvm->lock);
	kvm_arch_vcpu_destroy(vcpu);
	return r;
}

static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
	if (sigset) {
		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
		vcpu->sigset_active = 1;
		vcpu->sigset = *sigset;
	} else
		vcpu->sigset_active = 0;
	return 0;
}
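/*
 * Illustrative userspace run loop driving the KVM_RUN case below (sketch,
 * not part of this file):
 *
 *	struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 *	for (;;) {
 *		ioctl(vcpu_fd, KVM_RUN, 0);
 *		switch (run->exit_reason) {
 *		case KVM_EXIT_IO:   ... emulate port I/O ...   break;
 *		case KVM_EXIT_MMIO: ... emulate the access ... break;
 *		case KVM_EXIT_HLT:  ... idle ...               break;
 *		default:            ... handle or abort ...
 *		}
 *	}
 *
 * where mmap_size comes from KVM_GET_VCPU_MMAP_SIZE on the /dev/kvm fd.
 */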
static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;
	struct kvm_fpu *fpu = NULL;
	struct kvm_sregs *kvm_sregs = NULL;

	if (vcpu->kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_RUN:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
		break;
	case KVM_GET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
		if (r)
			goto out_free1;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
			goto out_free1;
		r = 0;
out_free1:
		kfree(kvm_regs);
		break;
	}
	case KVM_SET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
			goto out;
		r = -EFAULT;
		if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
			goto out_free2;
		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
		if (r)
			goto out_free2;
		r = 0;
out_free2:
		kfree(kvm_regs);
		break;
	}
	case KVM_GET_SREGS: {
		kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
		r = -ENOMEM;
		if (!kvm_sregs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SREGS: {
		kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
		r = -ENOMEM;
		if (!kvm_sregs)
			goto out;
		r = -EFAULT;
		if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &mp_state, sizeof mp_state))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = -EFAULT;
		if (copy_from_user(&mp_state, argp, sizeof mp_state))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		r = -EFAULT;
		if (copy_from_user(&tr, argp, sizeof tr))
			goto out;
		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &tr, sizeof tr))
			goto out;
		r = 0;
		break;
	}
	case KVM_DEBUG_GUEST: {
		struct kvm_debug_guest dbg;

		r = -EFAULT;
		if (copy_from_user(&dbg, argp, sizeof dbg))
			goto out;
		r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

		p = NULL;
		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof sigset)
				goto out;
			r = -EFAULT;
			if (copy_from_user(&sigset, sigmask_arg->sigset,
					   sizeof sigset))
				goto out;
			p = &sigset;
		}
		/* pass NULL (not a stack garbage sigset) when unmasking */
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
		break;
	}
	case KVM_GET_FPU: {
		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
		r = -ENOMEM;
		if (!fpu)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_FPU: {
		fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
		r = -ENOMEM;
		if (!fpu)
			goto out;
		r = -EFAULT;
		if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
		if (r)
			goto out;
		r = 0;
		break;
	}
	default:
		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
	}
out:
	kfree(fpu);
	kfree(kvm_sregs);
	return r;
}
static long kvm_vm_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_CREATE_VCPU:
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
		if (r < 0)
			goto out;
		break;
	case KVM_SET_USER_MEMORY_REGION: {
		struct kvm_userspace_memory_region kvm_userspace_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_userspace_mem, argp,
				   sizeof kvm_userspace_mem))
			goto out;

		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&log, argp, sizeof log))
			goto out;
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		if (r)
			goto out;
		break;
	}
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	case KVM_REGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = -ENXIO;
		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_UNREGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = -ENXIO;
		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
#endif
	default:
		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvm *kvm = vma->vm_file->private_data;
	struct page *page;

	if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
		return VM_FAULT_SIGBUS;
	page = gfn_to_page(kvm, vmf->pgoff);
	if (is_error_page(page)) {
		kvm_release_page_clean(page);
		return VM_FAULT_SIGBUS;
	}
	vmf->page = page;
	return 0;
}

static struct vm_operations_struct kvm_vm_vm_ops = {
	.fault = kvm_vm_fault,
};

static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vm_vm_ops;
	return 0;
}

static const struct file_operations kvm_vm_fops = {
	.release        = kvm_vm_release,
	.unlocked_ioctl = kvm_vm_ioctl,
	.compat_ioctl   = kvm_vm_ioctl,
	.mmap           = kvm_vm_mmap,
};
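/*
 * Illustrative top-level usage of the device (sketch, not part of this
 * file): userspace opens /dev/kvm, creates a VM fd, then vcpu fds:
 *
 *	int kvm_fd  = open("/dev/kvm", O_RDWR);
 *	int vm_fd   = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 *	int sz      = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *
 * KVM_CREATE_VM is handled by kvm_dev_ioctl_create_vm() below, and the
 * returned fd dispatches to kvm_vm_ioctl() above.
 */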
static int kvm_dev_ioctl_create_vm(void)
{
	int fd;
	struct kvm *kvm;

	kvm = kvm_create_vm();
	if (IS_ERR(kvm))
		return PTR_ERR(kvm);
	fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0);
	if (fd < 0)
		kvm_put_kvm(kvm);

	return fd;
}

static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	long r = -EINVAL;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		r = -EINVAL;
		if (arg)
			goto out;
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_dev_ioctl_create_vm();
		break;
	case KVM_CHECK_EXTENSION:
		r = kvm_dev_ioctl_check_extension(arg);
		break;
	case KVM_GET_VCPU_MMAP_SIZE:
		r = -EINVAL;
		if (arg)
			goto out;
		r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
		r += PAGE_SIZE;    /* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
		r += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
		break;
	case KVM_TRACE_ENABLE:
	case KVM_TRACE_PAUSE:
	case KVM_TRACE_DISABLE:
		r = kvm_trace_ioctl(ioctl, arg);
		break;
	default:
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl = kvm_dev_ioctl,
	.compat_ioctl   = kvm_dev_ioctl,
};

static struct miscdevice kvm_dev = {
	KVM_MINOR,
	"kvm",
	&kvm_chardev_ops,
};
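/*
 * Per-CPU hardware virtualization state: cpus_hardware_enabled tracks the
 * CPUs on which kvm_arch_hardware_enable() has run (e.g. VMXON on Intel),
 * so the CPU hotplug, suspend/resume and reboot paths below can enable or
 * disable virtualization exactly once per CPU.
 */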
static void hardware_enable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (cpu_isset(cpu, cpus_hardware_enabled))
		return;
	cpu_set(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_enable(NULL);
}

static void hardware_disable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpu_isset(cpu, cpus_hardware_enabled))
		return;
	cpu_clear(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_disable(NULL);
}

static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

	val &= ~CPU_TASKS_FROZEN;
	switch (val) {
	case CPU_DYING:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		hardware_disable(NULL);
		break;
	case CPU_UP_CANCELED:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, hardware_disable, NULL, 1);
		break;
	case CPU_ONLINE:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, hardware_enable, NULL, 1);
		break;
	}
	return NOTIFY_OK;
}

asmlinkage void kvm_handle_fault_on_reboot(void)
{
	if (kvm_rebooting)
		/* spin while reset goes on */
		while (true)
			;
	/* Fault while not rebooting.  We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
		      void *v)
{
	if (val == SYS_RESTART) {
		/*
		 * Some (well, at least mine) BIOSes hang on reboot if
		 * in vmx root mode.
		 */
		printk(KERN_INFO "kvm: exiting hardware virtualization\n");
		kvm_rebooting = true;
		on_each_cpu(hardware_disable, NULL, 1);
	}
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};
void kvm_io_bus_init(struct kvm_io_bus *bus)
{
	memset(bus, 0, sizeof(*bus));
}

void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		kvm_iodevice_destructor(pos);
	}
}

struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
					  gpa_t addr, int len, int is_write)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		if (pos->in_range(pos, addr, len, is_write))
			return pos;
	}

	return NULL;
}

void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
{
	BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));

	bus->devs[bus->dev_count++] = dev;
}

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
	.priority = 20, /* must be > scheduler priority */
};
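/*
 * Statistics: debugfs_entries (defined by the arch code) describes each
 * counter by its byte offset into struct kvm or struct kvm_vcpu; the
 * vm_stat_get()/vcpu_stat_get() helpers below sum that counter over every
 * VM on vm_list (and every vcpu of each VM) under kvm_lock.
 */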
static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		*val += *(u32 *)((void *)kvm + offset);
	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
			vcpu = kvm->vcpus[i];
			if (vcpu)
				*val += *(u32 *)((void *)vcpu + offset);
		}
	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
						(void *)(long)p->offset,
						stat_fops[p->kind]);
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(kvm_debugfs_dir);
}

static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
	hardware_disable(NULL);
	return 0;
}

static int kvm_resume(struct sys_device *dev)
{
	hardware_enable(NULL);
	return 0;
}

static struct sysdev_class kvm_sysdev_class = {
	.name = "kvm",
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static struct sys_device kvm_sysdev = {
	.id = 0,
	.cls = &kvm_sysdev_class,
};
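/*
 * Preempt notifiers: kvm_sched_out()/kvm_sched_in() below are called by
 * the scheduler when a task that currently has a vcpu loaded is preempted
 * and later rescheduled, so the arch code can save and restore guest state
 * (the notifier is registered in vcpu_load() and unregistered in
 * vcpu_put()).
 */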
struct page *bad_page;
pfn_t bad_pfn;

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}
int kvm_init(void *opaque, unsigned int vcpu_size,
	     struct module *module)
{
	int r;
	int cpu;

	kvm_init_debug();

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

	bad_pfn = page_to_pfn(bad_page);

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_arch_check_processor_compat,
				&r, 1);
		if (r < 0)
			goto out_free_1;
	}

	on_each_cpu(hardware_enable, NULL, 1);
	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	r = sysdev_class_register(&kvm_sysdev_class);
	if (r)
		goto out_free_3;

	r = sysdev_register(&kvm_sysdev);
	if (r)
		goto out_free_4;

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
					   __alignof__(struct kvm_vcpu),
					   0, NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_5;
	}

	kvm_chardev_ops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_free;
	}

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	return 0;

out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
	sysdev_unregister(&kvm_sysdev);
out_free_4:
	sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
	on_each_cpu(hardware_disable, NULL, 1);
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0:
	__free_page(bad_page);
out:
	kvm_arch_exit();
	kvm_exit_debug();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	kvm_trace_cleanup();
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	sysdev_unregister(&kvm_sysdev);
	sysdev_class_unregister(&kvm_sysdev_class);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_exit_debug();
	__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);