3 #include "kvm/boot-protocol.h"
4 #include "kvm/cpufeature.h"
5 #include "kvm/read-write.h"
6 #include "kvm/interrupt.h"
7 #include "kvm/mptable.h"
10 #include "kvm/kvm-cpu.h"
12 #include <linux/kvm.h>
14 #include <asm/bootparam.h>
16 #include <sys/ioctl.h>
30 #include <sys/eventfd.h>
31 #include <asm/unistd.h>
34 #define DEFINE_KVM_EXIT_REASON(reason) [reason] = #reason
35 #define KVM_PID_FILE_PATH "/.kvm-tools/"
36 #define HOME_DIR getenv("HOME")
/*
 * Human-readable names for KVM_EXIT_* codes, indexed directly by the
 * exit-reason value (DEFINE_KVM_EXIT_REASON expands to a designated
 * initializer: [reason] = "reason").  Used when reporting unexpected
 * VM exits.
 * NOTE(review): the closing "};" of this table is elided from this excerpt.
 */
const char *kvm_exit_reasons[] = {
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_UNKNOWN),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_EXCEPTION),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_IO),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_HYPERCALL),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_DEBUG),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_HLT),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_MMIO),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_IRQ_WINDOW_OPEN),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_SHUTDOWN),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_FAIL_ENTRY),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTR),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_SET_TPR),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_TPR_ACCESS),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_SIEIC),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_RESET),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_DCR),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_NMI),
	DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTERNAL_ERROR),
/*
 * DEFINE_KVM_EXT(ext) builds one entry (name string + capability code)
 * for the required-extension table below.
 * NOTE(review): the macro's continuation lines are elided from this excerpt.
 */
#define DEFINE_KVM_EXT(ext) \

/*
 * KVM capabilities this tool requires; each is probed with
 * KVM_CHECK_EXTENSION by kvm__check_extensions() at startup.
 * NOTE(review): the array declaration header (e.g.
 * "static struct { ... } kvm_req_ext[] = {") is elided from this excerpt.
 */
	{ DEFINE_KVM_EXT(KVM_CAP_COALESCED_MMIO) },
	{ DEFINE_KVM_EXT(KVM_CAP_SET_TSS_ADDR) },
	{ DEFINE_KVM_EXT(KVM_CAP_PIT2) },
	{ DEFINE_KVM_EXT(KVM_CAP_USER_MEMORY) },
	{ DEFINE_KVM_EXT(KVM_CAP_IRQ_ROUTING) },
	{ DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) },
	{ DEFINE_KVM_EXT(KVM_CAP_HLT) },
	{ DEFINE_KVM_EXT(KVM_CAP_IRQ_INJECT_STATUS) },
	{ DEFINE_KVM_EXT(KVM_CAP_EXT_CPUID) },
/* Global VM descriptor and per-VCPU contexts, defined elsewhere in the program. */
extern struct kvm *kvm;
extern struct kvm_cpu *kvm_cpus[KVM_NR_CPUS];

/*
 * Pause handshake state: pause_event is an eventfd that VCPU threads
 * write to from kvm__notify_paused(); pause_lock holds the paused VCPUs
 * until kvm__continue() releases them.
 */
static int pause_event;
static DEFINE_MUTEX(pause_lock);
/*
 * Ask KVM (KVM_CHECK_EXTENSION ioctl on the /dev/kvm fd) whether
 * @extension is available on this host.
 * NOTE(review): the function body's braces, the declaration of 'ret'
 * and the return path are elided from this excerpt.
 */
static bool kvm__supports_extension(struct kvm *kvm, unsigned int extension)
	ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, extension);
/*
 * Walk kvm_req_ext[] and verify every required capability is present,
 * reporting the first one that is missing.
 * NOTE(review): braces, the loop index declaration and return statements
 * are elided from this excerpt.
 */
static int kvm__check_extensions(struct kvm *kvm)
	for (i = 0; i < ARRAY_SIZE(kvm_req_ext); i++) {
		if (!kvm__supports_extension(kvm, kvm_req_ext[i].code)) {
			/* NOTE(review): "Unsuppored" is a typo in this runtime
			 * string — should read "Unsupported" (not fixable in a
			 * comment-only pass). */
			pr_error("Unsuppored KVM extension detected: %s",
				kvm_req_ext[i].name);
/*
 * Allocate and zero a fresh struct kvm; aborts via die() on allocation
 * failure.  NOTE(review): braces and the return of 'kvm' are elided
 * from this excerpt.
 */
static struct kvm *kvm__new(void)
	struct kvm *kvm = calloc(1, sizeof *kvm);
		die("out of memory");
/*
 * Record this process' PID under $HOME/.kvm-tools/<name>.pid so other
 * invocations can find the running instance (see
 * kvm__get_pid_by_instance()).
 *
 * NOTE(review): the unbounded sprintf() calls below can overflow
 * full_name[PATH_MAX] if $HOME is very long, and HOME_DIR
 * (getenv("HOME")) may be NULL — consider snprintf() plus a NULL check.
 * NOTE(review): 'fd' from open() is used unchecked; a failed open makes
 * the write() below fail with fd == -1.
 * NOTE(review): braces, the 'fd' declaration and close(fd) are elided
 * from this excerpt.
 */
static void kvm__create_pidfile(struct kvm *kvm)
	char full_name[PATH_MAX], pid[10];
	/* Ensure the $HOME/.kvm-tools/ directory exists before writing into it. */
	sprintf(full_name, "%s/%s", HOME_DIR, KVM_PID_FILE_PATH);
	mkdir(full_name, 0777);
	sprintf(full_name, "%s/%s/%s.pid", HOME_DIR, KVM_PID_FILE_PATH, kvm->name);
	fd = open(full_name, O_CREAT | O_WRONLY, 0666);
	/* Linux PIDs fit comfortably in pid[10] including "\n" and NUL. */
	sprintf(pid, "%u\n", getpid());
	if (write(fd, pid, strlen(pid)) <= 0)
		die("Failed creating PID file");
/*
 * Delete the $HOME/.kvm-tools/<name>.pid file created by
 * kvm__create_pidfile().
 * NOTE(review): the unlink() call and braces are elided from this
 * excerpt; same sprintf()/NULL-$HOME caveat as kvm__create_pidfile().
 */
void kvm__remove_pidfile(const char *name)
	char full_name[PATH_MAX];
	sprintf(full_name, "%s/%s/%s.pid", HOME_DIR, KVM_PID_FILE_PATH, name);
/*
 * Look up the PID of a running instance called @name by reading its
 * pidfile.  Returns the parsed PID (return path elided from this
 * excerpt).
 * NOTE(review): 'fd' from open() appears unchecked here, and the
 * read() result is only compared against 0 — a short read of up to 10
 * bytes may leave pid_str unterminated; verify against the elided
 * lines.
 */
int kvm__get_pid_by_instance(const char *name)
	char pid_str[10], pid_file[PATH_MAX];
	sprintf(pid_file, "%s/%s/%s.pid", HOME_DIR, KVM_PID_FILE_PATH, name);
	fd = open(pid_file, O_RDONLY);
	if (read(fd, pid_str, 10) == 0)
/*
 * Iterate over every "*.pid" file in $HOME/.kvm-tools/ and invoke
 * @callback with the instance name (filename minus the 4-char ".pid"
 * suffix) and its PID.
 *
 * NOTE(review): readdir_r() is deprecated (use readdir()); its return
 * value and 'result' are not checked before 'entry' is used, so the
 * end-of-directory / error case is not distinguishable here — confirm
 * against the elided loop-exit lines.
 * NOTE(review): the strlen()-4 truncation assumes every DT_REG entry
 * ends in ".pid"; a shorter name would index before the buffer.
 * NOTE(review): braces, 'pid'/'ret'/'dir' declarations, closedir() and
 * the return path are elided from this excerpt.
 */
int kvm__enumerate_instances(int (*callback)(const char *name, int pid))
	char full_name[PATH_MAX];
	struct dirent entry, *result;
	sprintf(full_name, "%s/%s", HOME_DIR, KVM_PID_FILE_PATH);
	dir = opendir(full_name);
	while (dir != NULL) {
		readdir_r(dir, &entry, &result);
		if (entry.d_type == DT_REG) {
			/* Strip the ".pid" suffix in place to recover the instance name. */
			entry.d_name[strlen(entry.d_name)-4] = 0;
			pid = kvm__get_pid_by_instance(entry.d_name);
			ret = callback(entry.d_name, pid);
/*
 * Tear down a VM: stop the injection timer, release guest RAM and
 * remove the instance pidfile.
 * NOTE(review): braces and any remaining cleanup (e.g. free(kvm)) are
 * elided from this excerpt.
 */
void kvm__delete(struct kvm *kvm)
	kvm__stop_timer(kvm);
	munmap(kvm->ram_start, kvm->ram_size);
	kvm__remove_pidfile(kvm->name);
/*
 * Detect hardware virtualization support on the host CPU via CPUID:
 * select the feature bit to probe (VMX for Intel, SVM for AMD — with
 * AMD using the 0x80000000 extended leaf base), confirm the extended
 * leaf exists, then test the feature bit in ECX of leaf eax_base+1.
 *
 * NOTE(review): the braces, the vendor-string CPUID call, the switch
 * header, 'feature'/'eax_base' declarations and the struct initializer
 * bodies are elided from this excerpt.
 */
static bool kvm__cpu_supports_vm(void)
	struct cpuid_regs regs;
	regs = (struct cpuid_regs) {
	case CPUID_VENDOR_INTEL_1:
		feature = KVM__X86_FEATURE_VMX;
	case CPUID_VENDOR_AMD_1:
		eax_base = 0x80000000;
		feature = KVM__X86_FEATURE_SVM;
	regs = (struct cpuid_regs) {
	/* Make sure the feature leaf (eax_base + 1) actually exists. */
	if (regs.eax < eax_base + 0x01)
	regs = (struct cpuid_regs) {
		.eax = eax_base + 0x01
	return regs.ecx & (1 << feature);
/*
 * Note: KVM_SET_USER_MEMORY_REGION assumes that we don't pass overlapping
 * memory regions to it. Therefore, be careful if you use this function for
 * registering memory regions for emulating hardware.
 *
 * Maps host memory at @userspace_addr into the guest physical address
 * space at @guest_phys, consuming the next free memory slot.
 * NOTE(review): braces, the 'ret' declaration, '.memory_size' / '.flags'
 * initializers and the error-check 'if' are elided from this excerpt.
 */
void kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr)
	struct kvm_userspace_memory_region mem;
	mem = (struct kvm_userspace_memory_region) {
		.slot			= kvm->mem_slots++,
		.guest_phys_addr	= guest_phys,
		.userspace_addr		= (unsigned long)userspace_addr,
	ret = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
		die_perror("KVM_SET_USER_MEMORY_REGION ioctl");
/*
 * Allocating RAM size bigger than 4GB requires us to leave a gap
 * in the RAM which is used for PCI MMIO, hotplug, and unconfigured
 * devices (see documentation of e820_setup_gap() for details).
 *
 * If we're required to initialize RAM bigger than 4GB, we will create
 * a gap between 0xe0000000 and 0x100000000 in the guest virtual mem space.
 *
 * NOTE(review): braces, the 'host_mem' declaration, the phys_start = 0
 * assignments and the else branch header are elided from this excerpt.
 */
void kvm__init_ram(struct kvm *kvm)
	u64 phys_start, phys_size;

	if (kvm->ram_size < KVM_32BIT_GAP_START) {
		/* Use a single block of RAM for 32bit RAM */
		phys_size	= kvm->ram_size;
		host_mem	= kvm->ram_start;

		kvm__register_mem(kvm, phys_start, phys_size, host_mem);

		/* First RAM range from zero to the PCI gap: */
		phys_size	= KVM_32BIT_GAP_START;
		host_mem	= kvm->ram_start;

		kvm__register_mem(kvm, phys_start, phys_size, host_mem);

		/* Second RAM range from 4GB to the end of RAM: */
		phys_start	= 0x100000000ULL;
		/* Whatever is left after the below-gap region just registered. */
		phys_size	= kvm->ram_size - phys_size;
		host_mem	= kvm->ram_start + phys_start;

		kvm__register_mem(kvm, phys_start, phys_size, host_mem);
/*
 * Query KVM's recommended VCPU count (KVM_CAP_NR_VCPUS).  Dies if the
 * ioctl fails.
 * NOTE(review): braces, the 'ret' declaration, the failure check and
 * the return of 'ret' are elided from this excerpt.
 */
int kvm__recommended_cpus(struct kvm *kvm)
	ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
		die_perror("KVM_CAP_NR_VCPUS");
/*
 * The following hack should be removed once 'x86: Raise the hard
 * VCPU count limit' makes its way into the mainline.
 */
#ifndef KVM_CAP_MAX_VCPUS
#define KVM_CAP_MAX_VCPUS 66
/* NOTE(review): the matching #endif is elided from this excerpt. */

/*
 * Query the hard VCPU limit (KVM_CAP_MAX_VCPUS), falling back to the
 * recommended count when the capability is not reported.
 * NOTE(review): braces, the 'ret' declaration, the fallback condition
 * and the return are elided from this excerpt.
 */
int kvm__max_cpus(struct kvm *kvm)
	ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
		ret = kvm__recommended_cpus(kvm);
/*
 * kvm__init - bring up a new VM.
 *
 * Opens @kvm_dev (normally /dev/kvm), verifies the KVM API version,
 * creates the VM, checks required extensions, sets the TSS address,
 * creates the in-kernel PIT and IRQ chip, maps @ram_size bytes of
 * guest RAM (with the 32-bit PCI gap mprotect'ed PROT_NONE when RAM
 * exceeds KVM_32BIT_GAP_START), marks RAM KSM-mergeable, and writes
 * the instance pidfile for @name.
 *
 * NOTE(review): braces, several 'if (ret < 0)' / errno-dispatch lines,
 * the kvm__new() call, name assignment and the final 'return kvm;' are
 * elided from this excerpt.
 * NOTE(review): "extention" is a typo in a runtime string below —
 * should read "extension" (not fixable in a comment-only pass).
 */
struct kvm *kvm__init(const char *kvm_dev, u64 ram_size, const char *name)
	struct kvm_pit_config pit_config = { .flags = 0, };

	if (!kvm__cpu_supports_vm())
		die("Your CPU does not support hardware virtualization");

	kvm->sys_fd = open(kvm_dev, O_RDWR);
	if (kvm->sys_fd < 0) {
		/* Distinct diagnostics depending on why /dev/kvm failed to open. */
			die("'%s' not found. Please make sure your kernel has CONFIG_KVM enabled and that the KVM modules are loaded.", kvm_dev);
			die("'%s' KVM driver not available.\n  # (If the KVM module is loaded then 'dmesg' may offer further clues about the failure.)", kvm_dev);
			fprintf(stderr, "  Fatal, could not open %s: ", kvm_dev);

	ret = ioctl(kvm->sys_fd, KVM_GET_API_VERSION, 0);
	if (ret != KVM_API_VERSION)
		die_perror("KVM_API_VERSION ioctl");

	kvm->vm_fd = ioctl(kvm->sys_fd, KVM_CREATE_VM, 0);
		die_perror("KVM_CREATE_VM ioctl");

	if (kvm__check_extensions(kvm))
		die("A required KVM extention is not supported by OS");

	/* Tell KVM where to place the guest's TSS pages (three pages below BIOS). */
	ret = ioctl(kvm->vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000);
		die_perror("KVM_SET_TSS_ADDR ioctl");

	ret = ioctl(kvm->vm_fd, KVM_CREATE_PIT2, &pit_config);
		die_perror("KVM_CREATE_PIT2 ioctl");

	kvm->ram_size = ram_size;

	if (kvm->ram_size < KVM_32BIT_GAP_START) {
		kvm->ram_start = mmap(NULL, ram_size, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
		/* RAM spans the 32-bit gap: map gap-sized extra and fence it off. */
		kvm->ram_start = mmap(NULL, ram_size + KVM_32BIT_GAP_SIZE, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
		if (kvm->ram_start != MAP_FAILED) {
			/*
			 * We mprotect the gap (see kvm__init_ram() for details) PROT_NONE so that
			 * if we accidentally write to it, we will know.
			 */
			mprotect(kvm->ram_start + KVM_32BIT_GAP_START, KVM_32BIT_GAP_SIZE, PROT_NONE);

	if (kvm->ram_start == MAP_FAILED)
		die("out of memory");

	/* Allow KSM to merge identical guest pages across VMs. */
	madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE);

	ret = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP);
		die_perror("KVM_CREATE_IRQCHIP ioctl");

	kvm__create_pidfile(kvm);
/* Real-mode entry point and stack used for both flat binaries and bzImages. */
#define BOOT_LOADER_SELECTOR	0x1000
#define BOOT_LOADER_IP		0x0000
#define BOOT_LOADER_SP		0x8000
/* Guest-flat address where the kernel command line is copied. */
#define BOOT_CMDLINE_OFFSET	0x20000

/* Minimum Linux boot-protocol version we support (2.06). */
#define BOOT_PROTOCOL_REQUIRED	0x206
#define LOAD_HIGH		0x01

/*
 * Copy a flat binary from @fd into guest memory at
 * BOOT_LOADER_SELECTOR:BOOT_LOADER_IP in 64KiB chunks and record the
 * real-mode entry state in @kvm.
 * NOTE(review): braces, the 'p'/'nr' declarations, the p += nr loop
 * body and the return value are elided from this excerpt.
 */
static int load_flat_binary(struct kvm *kvm, int fd)
	if (lseek(fd, 0, SEEK_SET) < 0)

	p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);

	while ((nr = read(fd, p, 65536)) > 0)

	kvm->boot_selector	= BOOT_LOADER_SELECTOR;
	kvm->boot_ip		= BOOT_LOADER_IP;
	kvm->boot_sp		= BOOT_LOADER_SP;
/* "HdrS" signature at boot.hdr.header identifying a bzImage. */
static const char *BZIMAGE_MAGIC = "HdrS";

/*
 * Load a Linux bzImage from @fd_kernel into guest memory, copy the
 * kernel command line and optional initrd (@fd_initrd, -1 if none),
 * fill in the boot_params header fields the boot protocol requires,
 * and record the real-mode entry state in @kvm.
 *
 * NOTE(review): braces, 'setup_size'/'p'/'nr'/'cmdline_size'/'addr'
 * declarations, several early 'return false;' paths, the p += nr copy
 * loops and the final 'return true;' are elided from this excerpt.
 */
static bool load_bzimage(struct kvm *kvm, int fd_kernel,
			int fd_initrd, const char *kernel_cmdline, u16 vidmode)
	struct boot_params *kern_boot;
	unsigned long setup_sects;
	struct boot_params boot;

	/*
	 * See Documentation/x86/boot.txt for details on bzImage on-disk and
	 * in-memory layout.
	 */
	if (lseek(fd_kernel, 0, SEEK_SET) < 0)

	if (read(fd_kernel, &boot, sizeof(boot)) != sizeof(boot))

	if (memcmp(&boot.hdr.header, BZIMAGE_MAGIC, strlen(BZIMAGE_MAGIC)))

	if (boot.hdr.version < BOOT_PROTOCOL_REQUIRED)
		die("Too old kernel");

	if (lseek(fd_kernel, 0, SEEK_SET) < 0)

	/* Boot protocol: setup_sects == 0 means the default of 4 sectors. */
	if (!boot.hdr.setup_sects)
		boot.hdr.setup_sects = BZ_DEFAULT_SETUP_SECTS;
	setup_sects = boot.hdr.setup_sects + 1;

	/* Sectors are 512 bytes each. */
	setup_size = setup_sects << 9;
	p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);

	/* copy setup.bin to mem*/
	if (read(fd_kernel, p, setup_size) != setup_size)

	/* copy vmlinux.bin to BZ_KERNEL_START*/
	p = guest_flat_to_host(kvm, BZ_KERNEL_START);

	while ((nr = read(fd_kernel, p, 65536)) > 0)

	p = guest_flat_to_host(kvm, BOOT_CMDLINE_OFFSET);
	if (kernel_cmdline) {
		cmdline_size = strlen(kernel_cmdline) + 1;
		/* Clamp to the maximum the kernel header advertises. */
		if (cmdline_size > boot.hdr.cmdline_size)
			cmdline_size = boot.hdr.cmdline_size;

		memset(p, 0, boot.hdr.cmdline_size);
		memcpy(p, kernel_cmdline, cmdline_size - 1);

	/* boot_params lives at offset 0 of the setup segment. */
	kern_boot = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, 0x00);

	kern_boot->hdr.cmd_line_ptr	= BOOT_CMDLINE_OFFSET;
	kern_boot->hdr.type_of_loader	= 0xff;	/* "undefined" bootloader ID */
	kern_boot->hdr.heap_end_ptr	= 0xfe00;
	kern_boot->hdr.loadflags	|= CAN_USE_HEAP;
	kern_boot->hdr.vid_mode		= vidmode;

	/*
	 * Read initrd image into guest memory
	 */
	if (fd_initrd >= 0) {
		struct stat initrd_stat;

		if (fstat(fd_initrd, &initrd_stat))

		/* Align the initrd load address down to a 1MiB boundary. */
		addr = boot.hdr.initrd_addr_max & ~0xfffff;

		if (addr < BZ_KERNEL_START)
			die("Not enough memory for initrd");
		else if (addr < (kvm->ram_size - initrd_stat.st_size))

		p = guest_flat_to_host(kvm, addr);
		nr = read(fd_initrd, p, initrd_stat.st_size);
		if (nr != initrd_stat.st_size)
			die("Failed to read initrd");

		kern_boot->hdr.ramdisk_image	= addr;
		kern_boot->hdr.ramdisk_size	= initrd_stat.st_size;

	kvm->boot_selector = BOOT_LOADER_SELECTOR;
	/*
	 * The real-mode setup code starts at offset 0x200 of a bzImage. See
	 * Documentation/x86/boot.txt for details.
	 */
	kvm->boot_ip = BOOT_LOADER_IP + 0x200;
	kvm->boot_sp = BOOT_LOADER_SP;
/* gzip magic bytes — an initrd image is expected to be gzip-compressed. */
#define GZIP_ID1	0x1f
#define GZIP_ID2	0x8b

/*
 * Sanity-check that @fd starts with the gzip magic, rewinding the file
 * afterwards so the caller can read it from the beginning.
 * NOTE(review): braces, the 'id' buffer declaration and the early
 * 'return false;' paths are elided from this excerpt.
 */
static bool initrd_check(int fd)
	if (read_in_full(fd, id, ARRAY_SIZE(id)) < 0)

	if (lseek(fd, 0, SEEK_SET) < 0)

	return id[0] == GZIP_ID1 && id[1] == GZIP_ID2;
/*
 * Load @kernel_filename (and optional @initrd_filename) into guest
 * memory: try it as a bzImage first, then fall back to treating it as
 * a flat binary.  Dies on unopenable files, an invalid initrd, or a
 * file that is neither format.
 * NOTE(review): braces, the 'ret' declaration, the open()-failure
 * checks, the success/cleanup paths and close() calls are elided from
 * this excerpt.
 */
bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename,
		const char *initrd_filename, const char *kernel_cmdline, u16 vidmode)
	int fd_kernel = -1, fd_initrd = -1;

	fd_kernel = open(kernel_filename, O_RDONLY);
		die("Unable to open kernel %s", kernel_filename);

	if (initrd_filename) {
		fd_initrd = open(initrd_filename, O_RDONLY);
			die("Unable to open initrd %s", initrd_filename);

		if (!initrd_check(fd_initrd))
			die("%s is not an initrd", initrd_filename);

	ret = load_bzimage(kvm, fd_kernel, fd_initrd, kernel_cmdline, vidmode);

	/* Not a bzImage — fall back to the flat-binary loader. */
	pr_warning("%s is not a bzImage. Trying to load it as a flat binary...", kernel_filename);

	ret = load_flat_binary(kvm, fd_kernel);

	die("%s is not a valid bzImage or flat binary", kernel_filename);
/*
 * kvm__setup_bios - inject BIOS into guest system memory
 * @kvm - guest system descriptor
 *
 * This function is a main routine where we poke guest memory
 * and install BIOS there.
 */
void kvm__setup_bios(struct kvm *kvm)
	/* standard minimal configuration */
	/* FIXME: SMP, ACPI and friends here */
	mptable_setup(kvm, kvm->nrcpus);
#define TIMER_INTERVAL_NS 1000000	/* 1 msec */

/*
 * This function sets up a timer that's used to inject interrupts from the
 * userspace hypervisor into the guest at periodical intervals. Please note
 * that clock interrupt, for example, is not handled here.
 *
 * Delivery is via SIGALRM to this thread (SIGEV_THREAD_ID + gettid()).
 * NOTE(review): braces and the 'struct sigevent sev;' declaration are
 * elided from this excerpt.
 * NOTE(review): '_sigev_un._tid' pokes a glibc-internal union member;
 * 'sigev_notify_thread_id' (where available) would be the portable
 * spelling — confirm against the project's supported libc.
 */
void kvm__start_timer(struct kvm *kvm)
	struct itimerspec its;

	memset(&sev, 0, sizeof(struct sigevent));
	sev.sigev_value.sival_int	= 0;
	sev.sigev_notify		= SIGEV_THREAD_ID;
	sev.sigev_signo			= SIGALRM;
	sev._sigev_un._tid		= syscall(__NR_gettid);

	if (timer_create(CLOCK_REALTIME, &sev, &kvm->timerid) < 0)
		die("timer_create()");

	/* Split the interval into whole seconds and remaining nanoseconds. */
	its.it_value.tv_sec	= TIMER_INTERVAL_NS / 1000000000;
	its.it_value.tv_nsec	= TIMER_INTERVAL_NS % 1000000000;
	its.it_interval.tv_sec	= its.it_value.tv_sec;
	its.it_interval.tv_nsec	= its.it_value.tv_nsec;

	if (timer_settime(kvm->timerid, 0, &its, NULL) < 0)
		die("timer_settime()");
/*
 * Disarm and delete the interrupt-injection timer created by
 * kvm__start_timer().
 * NOTE(review): braces and any timerid reset are elided from this
 * excerpt.
 */
void kvm__stop_timer(struct kvm *kvm)
	if (timer_delete(kvm->timerid) < 0)
		die("timer_delete()");
/*
 * Raise or lower guest interrupt line @irq to @level via the in-kernel
 * IRQ chip (KVM_IRQ_LINE ioctl).
 * NOTE(review): braces and the '.irq'/'.level' initializers of the
 * compound literal are elided from this excerpt.
 */
void kvm__irq_line(struct kvm *kvm, int irq, int level)
	struct kvm_irq_level irq_level;

	irq_level = (struct kvm_irq_level) {

	if (ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level) < 0)
		die_perror("KVM_IRQ_LINE failed");
/*
 * Hex-dump @size bytes of guest memory starting at guest-flat address
 * @addr, 8 bytes per line, stopping at the end of guest RAM.
 * NOTE(review): braces, the 'p'/'n' declarations and the break/return
 * on the host_ptr_in_ram() check are elided from this excerpt.
 */
void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size)
	/* Round the size down to a multiple of the 8-byte line width. */
	size &= ~7; /* mod 8 */

	p = guest_flat_to_host(kvm, addr);

	for (n = 0; n < size; n += 8) {
		if (!host_ptr_in_ram(kvm, p + n))

		printf("  0x%08lx: %02x %02x %02x %02x  %02x %02x %02x %02x\n",
			addr + n, p[n + 0], p[n + 1], p[n + 2], p[n + 3],
			p[n + 4], p[n + 5], p[n + 6], p[n + 7]);
/*
 * Pause all VCPUs: take pause_lock (which the VCPU threads then block
 * on in kvm__notify_paused()), signal every VCPU thread with
 * SIGKVMPAUSE, and wait on the pause_event eventfd until each VCPU has
 * acknowledged.  Paired with kvm__continue().
 * NOTE(review): braces, the 'u64 cur_read' declaration, the early
 * return when no guest is running and the eventfd failure check are
 * elided from this excerpt.
 */
void kvm__pause(void)
	int i, paused_vcpus = 0;

	/* Check if the guest is running */
	if (!kvm_cpus[0] || kvm_cpus[0]->thread == 0)

	mutex_lock(&pause_lock);

	pause_event = eventfd(0, 0);
		die("Failed creating pause notification event");
	for (i = 0; i < kvm->nrcpus; i++)
		pthread_kill(kvm_cpus[i]->thread, SIGKVMPAUSE);

	while (paused_vcpus < kvm->nrcpus) {
		/* Each paused VCPU adds 1 to the eventfd counter. */
		if (read(pause_event, &cur_read, sizeof(cur_read)) < 0)
			die("Failed reading pause event");
		paused_vcpus += cur_read;
/*
 * Resume all VCPUs after kvm__pause(): dropping pause_lock releases the
 * VCPU threads blocked on it in kvm__notify_paused().
 * NOTE(review): braces and the early-return body are elided from this
 * excerpt.
 */
void kvm__continue(void)
	/* Check if the guest is running */
	if (!kvm_cpus[0] || kvm_cpus[0]->thread == 0)

	mutex_unlock(&pause_lock);
/*
 * Called from a VCPU thread's SIGKVMPAUSE handler: signal kvm__pause()
 * via the pause_event eventfd that this VCPU has stopped, then block on
 * pause_lock until kvm__continue() releases it.
 * NOTE(review): braces and the 'u64 p = 1;' declaration are elided from
 * this excerpt.
 */
void kvm__notify_paused(void)
	if (write(pause_event, &p, sizeof(p)) < 0)
		die("Failed notifying of paused VCPU.");

	/* Block here until kvm__continue() drops the lock. */
	mutex_lock(&pause_lock);
	mutex_unlock(&pause_lock);