Sets some kvm_run out flags before going to userspace.
[freebsd-src/fkvm-freebsd.git] / sys / kern / kern_fkvm.c
blob 45f32b1f55400b72649a5b1b7fef8f50fb7757ad
1 /*-
2 * Copyright (c) 2008 Brent Stephens <brents@rice.edu>
3 * Copyright (c) 2008 Diego Ongaro <diego.ongaro@rice.edu>
4 * Copyright (c) 2008 Kaushik Kumar Ram <kaushik@rice.edu>
5 * Copyright (c) 2008 Oleg Pesok <olegpesok@gmail.com>
6 * All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
30 #include <sys/fkvm.h>
31 #include <sys/cdefs.h>
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/malloc.h>
36 #include <sys/sysproto.h>
37 #include <sys/file.h>
38 #include <sys/mman.h>
39 #include <sys/proc.h>
40 #include <sys/eventhandler.h>
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43 #include <vm/vm_extern.h>
44 #include <vm/vm_map.h>
45 #include <vm/vm_object.h>
46 #include <vm/vm_param.h>
47 #include <machine/_inttypes.h>
48 #include <machine/specialreg.h>
49 #include <machine/segments.h>
50 #include <machine/vmcb.h>
/*
 * Definitions for Port IO.
 *
 * Field layout of the SVM IOIO-intercept EXITINFO1 word: the intercepted
 * port number sits in the top 16 bits; address-size, operand-size,
 * REP-prefix, string-op and direction (type) flags occupy the low bits
 * at the shifts below.
 */
#define PORT_SHIFT  16
#define ADDR_SHIFT  7
#define SIZE_SHIFT  4
#define REP_SHIFT   3
#define STR_SHIFT   2
#define TYPE_SHIFT  0

#define PORT_MASK   0xFFFF0000
#define ADDR_MASK   (7 << ADDR_SHIFT)
#define SIZE_MASK   (7 << SIZE_SHIFT)
#define REP_MASK    (1 << REP_SHIFT)
#define STR_MASK    (1 << STR_SHIFT)
#define TYPE_MASK   (1 << TYPE_SHIFT)
/* End Definitions for Port IO */

#define PMIO_PAGE_OFFSET 1

/* Sizes of the SVM I/O and MSR permission bitmaps (bytes). */
#define IOPM_SIZE  (8*1024 + 1) /* TODO: ensure that this need not be 12 KiB, not just 8KiB+1 */
#define MSRPM_SIZE (8*1024)
/* fkvm data: module-global state shared by every guest. */

static int fkvm_loaded = 0;		/* set once module initialization succeeds */

static void *iopm = NULL;		/* I/O permission bitmap; Should I allocate a vm_object_t instead? */
static void *msrpm = NULL;		/* MSR permission bitmap; Should I allocate a vm_object_t instead? */

static void *hsave_area = NULL;		/* host state-save area used by VMRUN/#VMEXIT */

static eventhandler_tag exit_tag;	/* process-exit hook registration */
/* per-guest data */

/*
 * Indices into struct vcpu's saved guest register file.  The first 16
 * entries follow the hardware GPR encoding (RAX..R15); RIP is stored
 * last so NR_VCPU_REGS counts every slot.
 */
enum {
	VCPU_REGS_RAX = 0,
	VCPU_REGS_RCX = 1,
	VCPU_REGS_RDX = 2,
	VCPU_REGS_RBX = 3,
	VCPU_REGS_RSP = 4,
	VCPU_REGS_RBP = 5,
	VCPU_REGS_RSI = 6,
	VCPU_REGS_RDI = 7,
	VCPU_REGS_R8  = 8,
	VCPU_REGS_R9  = 9,
	VCPU_REGS_R10 = 10,
	VCPU_REGS_R11 = 11,
	VCPU_REGS_R12 = 12,
	VCPU_REGS_R13 = 13,
	VCPU_REGS_R14 = 14,
	VCPU_REGS_R15 = 15,
	VCPU_REGS_RIP,
	NR_VCPU_REGS
};
struct vcpu {
	/* VCPU data */
	struct vmcb *vmcb;		/* virtual machine control block (one page) */
	unsigned long vmcb_pa;		/* physical address of vmcb; RAX input to VMRUN */

	unsigned long regs[NR_VCPU_REGS];	/* guest GPRs + RIP, indexed by VCPU_REGS_* */
	u_int64_t host_fs_base;		/* host MSR_FSBASE saved across a world switch */
	u_int64_t host_gs_base;		/* host MSR_GSBASE saved across a world switch */
	u_int64_t cr2;			/* guest CR2, swapped in/out around VMRUN */
	u_int64_t cr3;

	/* Shadow copies of the guest's MTRR MSRs (one bit-packed byte each). */
	struct {
		uint64_t default_type;
		uint64_t mtrr64k[MTRR_N64K/8];
		uint64_t mtrr16k[MTRR_N16K/8];
		uint64_t mtrr4k [MTRR_N4K /8];
#define FKVM_MTRR_NVAR 8
		uint64_t mtrrvar[FKVM_MTRR_NVAR *2];	/* variable-range base/mask pairs */
	} mtrrs;

	struct guestvm *guest_vm;	/* back-pointer to the owning VM */
};
struct guestvm {
	struct vcpu *vcpus[MAX_VCPUS];	/* created vcpus; slots filled in order */
	int nr_vcpus;			/* number of valid entries in vcpus[] */

	struct vmspace *sp;		/* guest-physical address space */
	u_int64_t nested_cr3;		/* nested page table root, copied into each VMCB */
};
/*
 * Raw opcode bytes for the SVM instructions, so the file assembles even
 * with toolchains whose assembler does not know the mnemonics.
 */
#define SVM_VMLOAD  ".byte 0x0f, 0x01, 0xda"
#define SVM_VMRUN   ".byte 0x0f, 0x01, 0xd8"
#define SVM_VMSAVE  ".byte 0x0f, 0x01, 0xdb"
#define SVM_CLGI    ".byte 0x0f, 0x01, 0xdd"
#define SVM_STGI    ".byte 0x0f, 0x01, 0xdc"
#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
147 static inline struct vcpu *
148 TD_GET_VCPU(struct thread *td)
150 struct vcpu *vcpu;
151 vcpu = td->vcpu;
152 if (vcpu == NULL)
153 printf("TD_GET_VCPU -> NULL\n");
154 return vcpu;
/* Attach a vcpu to a thread (stored directly in the thread structure). */
static inline void
TD_SET_VCPU(struct thread *td, struct vcpu *vcpu)
{
	td->vcpu = vcpu;
}
163 static inline struct guestvm *
164 PROC_GET_GUESTVM(struct proc *proc)
166 struct guestvm *guestvm;
167 guestvm = proc->p_guestvm;
168 return guestvm;
171 static inline void
172 PROC_SET_GUESTVM(struct proc *proc, struct guestvm *guestvm)
174 proc->p_guestvm = guestvm; \
/* Dump one VMCB segment register (selector, attributes, limit, base). */
static void
print_vmcb_seg(struct vmcb_seg* vmcb_seg, const char* name)
{
	printf("%s Selector\n", name);
	printf("Selector : %" PRIx16 "\n", vmcb_seg->selector);
	printf("Attributes : %" PRIx16 "\n", vmcb_seg->attrib);
	printf("Limit : %" PRIx32 "\n", vmcb_seg->limit);
	printf("Base Address : %" PRIx64 "\n", vmcb_seg->base);
	printf("\n");
}
188 static void
189 print_vmcb(struct vmcb *vmcb)
191 printf("VMCB Control Area\n");
192 printf("Intercept CR Reads : %" PRIx16 "\n", vmcb->control.intercept_cr_reads);
193 printf("Intercept CR Writes : %" PRIx16 "\n", vmcb->control.intercept_cr_writes);
194 printf("Intercept DR Reads : %" PRIx16 "\n", vmcb->control.intercept_dr_reads);
195 printf("Intercept DR Writes : %" PRIx16 "\n", vmcb->control.intercept_dr_writes);
196 printf("Intercept Exceptions : %" PRIx32 "\n", vmcb->control.intercept_exceptions);
197 printf("Intercepts : %" PRIx64 "\n", vmcb->control.intercepts);
198 printf("Reserved 1: \n");
199 for(int i=0; i < 44; i++) {
200 printf("%" PRIx8 "", vmcb->control.reserved_1[i]); /* Should be Zero */
202 printf("\n");
203 printf("IOPM Base PA : %" PRIx64 "\n", vmcb->control.iopm_base_pa);
204 printf("MSRPM Base PA : %" PRIx64 "\n", vmcb->control.msrpm_base_pa);
205 printf("TSC Offset : %" PRIx64 "\n", vmcb->control.tsc_offset);
206 printf("Guest ASID : %" PRIx32 "\n", vmcb->control.guest_asid);
207 printf("TLB Control : %" PRIx8 "\n", vmcb->control.tlb_control);
208 printf("Reserved 2 : \n");
209 for(int i=0; i < 3; i++) {
210 printf("%" PRIx8 "", vmcb->control.reserved_1[i]); /* Should be Zero */
212 printf("\n");
213 printf("Virtual TPR : %" PRIx8 "\n", vmcb->control.v_tpr);
214 printf("Virtual IRQ : %" PRIx8 "\n", vmcb->control.v_irq);
215 printf("Virtual Interrupt : %" PRIx8 "\n", vmcb->control.v_intr);
216 printf("Virtual Interrupt Masking: %" PRIx8 "\n", vmcb->control.v_intr_masking);
217 printf("Virtual Interrupt Vector : %" PRIx8 "\n", vmcb->control.v_intr_vector);
218 printf("Reserved 6 : \n");
219 for(int i=0; i < 3; i++) {
220 printf("%" PRIx8 "", vmcb->control.reserved_6[i]); /* Should be Zero */
222 printf("\n");
223 printf("Interrupt Shadow : %" PRIx8 "\n", vmcb->control.intr_shadow);
224 printf("Reserved 7 : \n");
225 for(int i=0; i < 7; i++) {
226 printf("%" PRIx8 "", vmcb->control.reserved_7[i]); /* Should be Zero */
228 printf("\n");
229 printf("Exit Code : %" PRIx64 "\n", vmcb->control.exit_code);
230 printf("Exit Info 1 : %" PRIx64 "\n", vmcb->control.exit_info_1);
231 printf("Exit Info 2 : %" PRIx64 "\n", vmcb->control.exit_info_2);
232 printf("Exit Interrupt Info : %" PRIx32 "\n", vmcb->control.exit_int_info);
233 printf("Exit Interrupt Info Err Code: %" PRIx32 "\n", vmcb->control.exit_int_info_err_code);
234 printf("Nested Control : %" PRIx64 "\n", vmcb->control.nested_ctl);
235 printf("Reserved 8 : \n");
236 for(int i=0; i < 16; i++) {
237 printf("%" PRIx8 "", vmcb->control.reserved_8[i]); /* Should be Zero */
239 printf("\n");
240 printf("Event Injection : %" PRIx64 "\n", vmcb->control.event_inj);
241 printf("Nested CR3 : %" PRIx64 "\n", vmcb->control.nested_cr3);
242 printf("LBR Virtualization Enable: %" PRIx64 "\n", vmcb->control.lbr_virt_enable);
243 printf("Reserved 9 : \n");
244 for(int i=0; i < 832; i++) {
245 printf("%" PRIx8 "", vmcb->control.reserved_9[i]); /* Should be Zero */
247 printf("\n");
249 printf("\n");
251 printf("VMCB Save Area\n");
252 print_vmcb_seg(&(vmcb->save.es), "ES");
253 print_vmcb_seg(&(vmcb->save.cs), "CS");
254 print_vmcb_seg(&(vmcb->save.ss), "SS");
255 print_vmcb_seg(&(vmcb->save.ds), "DS");
256 print_vmcb_seg(&(vmcb->save.fs), "FS");
257 print_vmcb_seg(&(vmcb->save.gs), "GS");
258 print_vmcb_seg(&(vmcb->save.gdtr), "GDTR");
259 print_vmcb_seg(&(vmcb->save.ldtr), "LDTR");
260 print_vmcb_seg(&(vmcb->save.idtr), "IDTR");
261 print_vmcb_seg(&(vmcb->save.tr), "TR");
262 printf("Reserved 1 : \n");
263 for(int i=0; i < 43; i++) {
264 printf("%" PRIx8 "", vmcb->save.reserved_1[i]); /* Should be Zero */
266 printf("\n");
267 printf("Current Processor Level : %" PRIx8 "\n", vmcb->save.cpl);
268 printf("Reserved 2 : \n");
269 for(int i=0; i < 4; i++) {
270 printf("%" PRIx8 "", vmcb->save.reserved_2[i]); /* Should be Zero */
272 printf("\n");
273 printf("EFER : %" PRIx64 "\n", vmcb->save.efer);
274 printf("Reserved 3 : \n");
275 for(int i=0; i < 112; i++) {
276 printf("%" PRIx8 "", vmcb->save.reserved_3[i]); /* Should be Zero */
278 printf("\n");
279 printf("Control Register 4 : %" PRIx64 "\n", vmcb->save.cr4);
280 printf("Control Register 3 : %" PRIx64 "\n", vmcb->save.cr3);
281 printf("Control Register 0 : %" PRIx64 "\n", vmcb->save.cr0);
282 printf("Debug Register 7 : %" PRIx64 "\n", vmcb->save.dr7);
283 printf("Debug Register 6 : %" PRIx64 "\n", vmcb->save.dr6);
284 printf("RFlags : %" PRIx64 "\n", vmcb->save.rflags);
285 printf("RIP : %" PRIx64 "\n", vmcb->save.rip);
286 printf("Reserved 4 : \n");
287 for(int i=0; i < 88; i++) {
288 printf("%" PRIx8 "", vmcb->save.reserved_4[i]); /* Should be Zero */
290 printf("\n");
291 printf("RSP : %" PRIx64 "\n", vmcb->save.rsp);
292 printf("Reserved 5 : \n");
293 for(int i=0; i < 24; i++) {
294 printf("%" PRIx8 "", vmcb->save.reserved_5[i]); /* Should be Zero */
296 printf("\n");
297 printf("RAX : %" PRIx64 "\n", vmcb->save.rax);
298 printf("STAR : %" PRIx64 "\n", vmcb->save.star);
299 printf("LSTAR : %" PRIx64 "\n", vmcb->save.lstar);
300 printf("CSTAR : %" PRIx64 "\n", vmcb->save.cstar);
301 printf("SFMASK : %" PRIx64 "\n", vmcb->save.sfmask);
302 printf("Kernel GS Base : %" PRIx64 "\n", vmcb->save.kernel_gs_base);
303 printf("SYSENTER CS : %" PRIx64 "\n", vmcb->save.sysenter_cs);
304 printf("SYSENTER ESP : %" PRIx64 "\n", vmcb->save.sysenter_esp);
305 printf("SYSENTER EIP : %" PRIx64 "\n", vmcb->save.sysenter_eip);
306 printf("Control Register 2 : %" PRIx64 "\n", vmcb->save.cr2);
307 printf("Reserved 6 : \n");
308 for(int i=0; i < 32; i++) {
309 printf("%" PRIx8 "", vmcb->save.reserved_6[i]); /* Should be Zero */
311 printf("\n");
312 printf("Global PAT : %" PRIx64 "\n", vmcb->save.g_pat);
313 printf("Debug Control : %" PRIx64 "\n", vmcb->save.dbg_ctl);
314 printf("BR From : %" PRIx64 "\n", vmcb->save.br_from);
315 printf("BR To : %" PRIx64 "\n", vmcb->save.br_to);
316 printf("Last Exception From : %" PRIx64 "\n", vmcb->save.last_excp_from);
317 printf("Last Exception To : %" PRIx64 "\n", vmcb->save.last_excp_to);
319 printf("\n\n");
#if 0	/* debugging helpers, currently compiled out */
/* Dump every bit-field of a long-mode TSS descriptor. */
static void
print_tss_desc(struct system_segment_descriptor *tss_desc)
{
	printf("TSS desc @ %p:\n", tss_desc);
	printf("sd_lolimit: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_lolimit);
	printf("sd_lobase: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_lobase);
	printf("sd_type: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_type);
	printf("sd_dpl: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_dpl);
	printf("sd_p: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_p);
	printf("sd_hilimit: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_hilimit);
	printf("sd_xx0: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_xx0);
	printf("sd_gran: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_gran);
	printf("sd_hibase: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_hibase);
	printf("sd_xx1: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_xx1);
	printf("sd_mbz: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_mbz);
	printf("sd_xx2: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_xx2);
	printf("\n\n");
}

/* Hex-dump the TSS a descriptor points at, one 32-bit word per line. */
static void
print_tss(struct system_segment_descriptor *tss_desc)
{
	u_int32_t *base;
	int limit;
	int i;

	/* Reassemble base/limit from the split descriptor fields. */
	base = (u_int32_t*) ((((u_int64_t) tss_desc->sd_hibase) << 24) | ((u_int64_t) tss_desc->sd_lobase));
	limit = ((tss_desc->sd_hilimit << 16) | tss_desc->sd_lolimit) / 4;

	printf("TSS: @ %p\n", base);
	for (i = 0; i <= limit; i++)
		printf("%x: 0x%" PRIx32 "\n", i, base[i]);
	printf("\n\n");
}
#endif
/*
 * Compact one-line-per-register dump of the VMCB fields that matter for
 * debugging world switches (segments, rip, syscall MSR shadows).
 */
static inline void
print_vmcb_save_area(struct vmcb *vmcb)
{
	printf("VMCB save area:\n");
	printf(" cs: [selector %" PRIx16 ", attrib %" PRIx16 ", limit %" PRIx32 ", base %" PRIx64 "]\n",
	       vmcb->save.cs.selector,
	       vmcb->save.cs.attrib,
	       vmcb->save.cs.limit,
	       vmcb->save.cs.base);
	printf(" fs: [selector %" PRIx16 ", attrib %" PRIx16 ", limit %" PRIx32 ", base %" PRIx64 "]\n",
	       vmcb->save.fs.selector,
	       vmcb->save.fs.attrib,
	       vmcb->save.fs.limit,
	       vmcb->save.fs.base);
	printf(" gs: [selector %" PRIx16 ", attrib %" PRIx16 ", limit %" PRIx32 ", base %" PRIx64 "]\n",
	       vmcb->save.gs.selector,
	       vmcb->save.gs.attrib,
	       vmcb->save.gs.limit,
	       vmcb->save.gs.base);
	printf(" tr: [selector %" PRIx16 ", attrib %" PRIx16 ", limit %" PRIx32 ", base %" PRIx64 "]\n",
	       vmcb->save.tr.selector,
	       vmcb->save.tr.attrib,
	       vmcb->save.tr.limit,
	       vmcb->save.tr.base);
	printf(" ldtr: [selector %" PRIx16 ", attrib %" PRIx16 ", limit %" PRIx32 ", base %" PRIx64 "]\n",
	       vmcb->save.ldtr.selector,
	       vmcb->save.ldtr.attrib,
	       vmcb->save.ldtr.limit,
	       vmcb->save.ldtr.base);
	printf(" rip: %" PRIx64 "\n", vmcb->save.rip);
	printf(" kernel_gs_base: %" PRIx64 "\n", vmcb->save.kernel_gs_base);
	printf(" star: %" PRIx64 "\n", vmcb->save.star);
	printf(" lstar: %" PRIx64 "\n", vmcb->save.lstar);
	printf(" cstar: %" PRIx64 "\n", vmcb->save.cstar);
	printf(" sfmask: %" PRIx64 "\n", vmcb->save.sfmask);
	printf(" sysenter_cs: %" PRIx64 "\n", vmcb->save.sysenter_cs);
	printf(" sysenter_esp: %" PRIx64 "\n", vmcb->save.sysenter_esp);
	printf(" sysenter_eip: %" PRIx64 "\n", vmcb->save.sysenter_eip);
	printf("\n\n");
}
400 static int
401 vmrun_assert(struct vmcb *vmcb)
403 #define A(cond) do { if ((cond)) { printf("Error: assertion not met on line %d\n", __LINE__); bad = 1; } } while (0)
405 int bad;
407 bad = 0;
409 // The following are illegal:
411 //EFER.SVME is zero.
412 A((vmcb->save.efer & 0x0000000000001000) == 0);
414 // CR0.CD is zero and CR0.NW is set
415 A( ((vmcb->save.cr0 & 0x0000000040000000) == 0) &&
416 ((vmcb->save.cr0 & 0x0000000020000000) != 0));
418 // CR0[63:32] are not zero.
419 A((vmcb->save.cr0 & 0xFFFFFFFF00000000) == 0xFFFFFFFF00000000);
421 // Any MBZ bit of CR3 is set.
422 A((vmcb->save.cr3 & 0xFFF0000000000000) != 0);
424 // CR4[63:11] are not zero.
425 A((vmcb->save.cr4 & 0xFFFFFFFFFFFFF800) == 0xFFFFFFFFFFFFF800);
427 // DR6[63:32] are not zero.
428 A((vmcb->save.dr6 & 0xFFFFFFFF00000000) == 0xFFFFFFFF00000000);
430 // DR7[63:32] are not zero.
431 A((vmcb->save.dr7 & 0xFFFFFFFF00000000) == 0xFFFFFFFF00000000);
433 // EFER[63:15] are not zero.
434 A((vmcb->save.efer & 0xFFFFFFFFFFFF8000) == 0xFFFFFFFFFFF8000);
436 // EFER.LMA or EFER.LME is non-zero and this processor does not support long mode.
437 //// A((vmcb->save.efer & 0x0000000000000500) != 0);
439 // EFER.LME and CR0.PG are both set and CR4.PAE is zero.
440 A( ((vmcb->save.efer & 0x0000000000000100) != 0) &&
441 ((vmcb->save.cr0 & 0x0000000080000000) != 0) &&
442 ((vmcb->save.cr4 & 0x0000000000000020) != 0));
444 // EFER.LME and CR0.PG are both non-zero and CR0.PE is zero.
445 A( ((vmcb->save.efer & 0x0000000000000100) != 0) &&
446 ((vmcb->save.cr0 & 0x0000000080000000) != 0) &&
447 ((vmcb->save.cr0 & 0x0000000000000001) == 0));
449 // EFER.LME, CR0.PG, CR4.PAE, CS.L, and CS.D are all non-zero.
450 // cs.attrib = concat 55-52 and 47-40 (p372 v2)
451 A( ((vmcb->save.efer & 0x0000000000000100) != 0) &&
452 ((vmcb->save.cr0 & 0x0000000080000000) != 0) &&
453 ((vmcb->save.cr4 & 0x0000000000000020) != 0) &&
454 ((vmcb->save.cs.attrib & 0x0200) != 0) &&
455 ((vmcb->save.cs.attrib & 0x0400) != 0));
457 // The VMRUN intercept bit is clear.
458 A((vmcb->control.intercepts & 0x0000000100000000) == 0);
460 // The MSR or IOIO intercept tables extend to a physical address that is
461 // greater than or equal to the maximum supported physical address.
463 // Illegal event injection (see Section 15.19 on page 391).
465 // ASID is equal to zero.
466 A(vmcb->control.guest_asid == 0);
468 // VMRUN can load a guest value of CR0 with PE = 0 but PG = 1, a
469 // combination that is otherwise illegal (see Section 15.18).
471 // In addition to consistency checks, VMRUN and #VMEXIT canonicalize (i.e.,
472 // sign-extend to 63 bits) all base addresses in the segment registers
473 // that have been loaded.
475 return bad;
477 #undef A
/*
 * Perform one world switch into the guest and back.
 *
 * Sequence: validate the VMCB, snapshot the host state that VMRUN does
 * not preserve for us (FS/GS bases, segment selectors, CR2, DR6/DR7,
 * the syscall MSRs and the TSS descriptor/TR), copy the cached guest
 * rax/rsp/rip into the VMCB, CLGI, then run the guest via
 * VMLOAD/VMRUN/VMSAVE with all GPRs hand-loaded from vcpu->regs[].
 * On #VMEXIT the GPRs are written back to vcpu->regs[], host state is
 * restored, and STGI re-enables global interrupts.
 *
 * Runs with the per-vcpu VMCB already populated; statement order here is
 * load-bearing (host state must be restored before STGI).
 */
static void
fkvm_vcpu_run(struct vcpu *vcpu)
{
	u_int64_t lstar;
	u_int64_t cstar;
	u_int64_t star;
	u_int64_t sfmask;

	u_short fs_selector;
	u_short gs_selector;
	u_short ldt_selector;

	unsigned long host_cr2;
	unsigned long host_dr6;
	unsigned long host_dr7;

	struct system_segment_descriptor *tss_desc;
	u_int64_t sel;

	struct vmcb *vmcb;

	//printf("begin fkvm_vcpu_run\n");

	vmcb = vcpu->vmcb;

	/* Refuse to VMRUN a state the CPU would reject. */
	if (vmrun_assert(vmcb))
		return;

	/* VMRUN clears the TSS-busy bit; remember the descriptor so we can
	 * re-mark it busy and reload TR afterwards. */
	tss_desc = (struct system_segment_descriptor*) (&gdt[GPROC0_SEL]);
	sel = GSEL(GPROC0_SEL, SEL_KPL);

	// printf("GSEL(GPROC0_SEL, SEL_KPL)=0x%" PRIx64 "\n", sel);
	// print_tss_desc(tss_desc);
	// print_tss(tss_desc);

	// print_vmcb_save_area(vmcb);
	// printf("vcpu->regs[VCPU_REGS_RIP]: 0x%lx\n", vcpu->regs[VCPU_REGS_RIP]);
	// disable_intr();

	/* rax/rsp/rip travel through the VMCB, not through the GPR asm below. */
	vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX];
	vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP];
	vmcb->save.rip = vcpu->regs[VCPU_REGS_RIP];

	/* meh: kvm has pre_svm_run(svm); */

	/* Host FS/GS bases are clobbered by the guest's VMLOAD state. */
	vcpu->host_fs_base = rdmsr(MSR_FSBASE);
	vcpu->host_gs_base = rdmsr(MSR_GSBASE);
	// printf("host_fs_base: 0x%" PRIx64 "\n", vcpu->host_fs_base);
	// printf("host_gs_base: 0x%" PRIx64 "\n", vcpu->host_gs_base);

	fs_selector = rfs();
	gs_selector = rgs();
	ldt_selector = rldt();
	// printf("fs selector: %hx\n", fs_selector);
	// printf("gs selector: %hx\n", gs_selector);
	// printf("ldt selector: %hx\n", ldt_selector);

	host_cr2 = rcr2();

	host_dr6 = rdr6();
	host_dr7 = rdr7();

	vmcb->save.cr2 = vcpu->cr2;
	/* meh: cr3? */
	// TODO: something with apic_base?

	/* meh: dr7? db_regs? */

	// printf("MSR_STAR: %" PRIx64 "\n", rdmsr(MSR_STAR));
	// printf("MSR_LSTAR: %" PRIx64 "\n", rdmsr(MSR_LSTAR));
	// printf("MSR_CSTAR: %" PRIx64 "\n", rdmsr(MSR_CSTAR));
	// printf("MSR_SF_MASK: %" PRIx64 "\n", rdmsr(MSR_SF_MASK));

	/* Syscall MSRs are replaced by the guest's VMLOAD state. */
	star = rdmsr(MSR_STAR);
	lstar = rdmsr(MSR_LSTAR);
	cstar = rdmsr(MSR_CSTAR);
	sfmask = rdmsr(MSR_SF_MASK);

	// printf("CLGI...\n");

	/* Block global interrupts for the duration of the switch. */
	__asm __volatile (SVM_CLGI);

	// enable_intr();

	/*
	 * Load guest GPRs from vcpu->regs[], run the guest, and store them
	 * back.  %rax holds the vcpu pointer going in and the VMCB physical
	 * address across VMLOAD/VMRUN/VMSAVE (hence the push/pop around it).
	 */
	__asm __volatile (
		"push %%rbp; \n\t"
		"mov %c[rbx](%[svm]), %%rbx \n\t"
		"mov %c[rcx](%[svm]), %%rcx \n\t"
		"mov %c[rdx](%[svm]), %%rdx \n\t"
		"mov %c[rsi](%[svm]), %%rsi \n\t"
		"mov %c[rdi](%[svm]), %%rdi \n\t"
		"mov %c[rbp](%[svm]), %%rbp \n\t"
		"mov %c[r8](%[svm]), %%r8 \n\t"
		"mov %c[r9](%[svm]), %%r9 \n\t"
		"mov %c[r10](%[svm]), %%r10 \n\t"
		"mov %c[r11](%[svm]), %%r11 \n\t"
		"mov %c[r12](%[svm]), %%r12 \n\t"
		"mov %c[r13](%[svm]), %%r13 \n\t"
		"mov %c[r14](%[svm]), %%r14 \n\t"
		"mov %c[r15](%[svm]), %%r15 \n\t"

		/* Enter guest mode */
		"push %%rax \n\t"
		"mov %c[vmcb](%[svm]), %%rax \n\t"
		SVM_VMLOAD "\n\t"
		SVM_VMRUN "\n\t"
		SVM_VMSAVE "\n\t"
		"pop %%rax \n\t"

		/* Save guest registers, load host registers */
		"mov %%rbx, %c[rbx](%[svm]) \n\t"
		"mov %%rcx, %c[rcx](%[svm]) \n\t"
		"mov %%rdx, %c[rdx](%[svm]) \n\t"
		"mov %%rsi, %c[rsi](%[svm]) \n\t"
		"mov %%rdi, %c[rdi](%[svm]) \n\t"
		"mov %%rbp, %c[rbp](%[svm]) \n\t"
		"mov %%r8, %c[r8](%[svm]) \n\t"
		"mov %%r9, %c[r9](%[svm]) \n\t"
		"mov %%r10, %c[r10](%[svm]) \n\t"
		"mov %%r11, %c[r11](%[svm]) \n\t"
		"mov %%r12, %c[r12](%[svm]) \n\t"
		"mov %%r13, %c[r13](%[svm]) \n\t"
		"mov %%r14, %c[r14](%[svm]) \n\t"
		"mov %%r15, %c[r15](%[svm]) \n\t"
		"pop %%rbp"
		: /* no outputs */
		: [svm]"a"(vcpu),
		  [vmcb]"i"(offsetof(struct vcpu, vmcb_pa)),
		  [rbx]"i"(offsetof(struct vcpu, regs[VCPU_REGS_RBX])),
		  [rcx]"i"(offsetof(struct vcpu, regs[VCPU_REGS_RCX])),
		  [rdx]"i"(offsetof(struct vcpu, regs[VCPU_REGS_RDX])),
		  [rsi]"i"(offsetof(struct vcpu, regs[VCPU_REGS_RSI])),
		  [rdi]"i"(offsetof(struct vcpu, regs[VCPU_REGS_RDI])),
		  [rbp]"i"(offsetof(struct vcpu, regs[VCPU_REGS_RBP])),
		  [r8 ]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R8 ])),
		  [r9 ]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R9 ])),
		  [r10]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R10])),
		  [r11]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R11])),
		  [r12]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R12])),
		  [r13]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R13])),
		  [r14]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R14])),
		  [r15]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R15]))
		: "cc", "memory",
		  "rbx", "rcx", "rdx", "rsi", "rdi",
		  "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
		);

	/* meh: dr7? db_regs? */

	/* Pull the guest state the CPU wrote into the VMCB back into vcpu. */
	vcpu->cr2 = vmcb->save.cr2;

	vcpu->regs[VCPU_REGS_RAX] = vmcb->save.rax;
	vcpu->regs[VCPU_REGS_RSP] = vmcb->save.rsp;
	vcpu->regs[VCPU_REGS_RIP] = vmcb->save.rip;

	/* Restore host state before re-enabling global interrupts. */
	load_dr6(host_dr6);
	load_dr7(host_dr7);

	load_cr2(host_cr2);

	load_fs(fs_selector);
	load_gs(gs_selector);
	lldt(ldt_selector);

	wrmsr(MSR_FSBASE, vcpu->host_fs_base);
	wrmsr(MSR_GSBASE, vcpu->host_gs_base);

	/* VMRUN cleared the busy bit; re-mark and reload the host TSS. */
	tss_desc->sd_type = SDT_SYSTSS;
	ltr(sel);

	wrmsr(MSR_STAR, star);
	wrmsr(MSR_LSTAR, lstar);
	wrmsr(MSR_CSTAR, cstar);
	wrmsr(MSR_SF_MASK, sfmask);

	// disable_intr();

	__asm __volatile (SVM_STGI);

	// printf("STGI\n");

	// print_tss_desc(tss_desc);
	// print_tss(tss_desc);

	// print_vmcb_save_area(vmcb);

	// enable_intr();

	/* meh: next_rip */
}
673 static void
674 _fkvm_init_seg(struct vmcb_seg *seg, uint16_t attrib)
676 seg->selector = 0;
677 seg->attrib = VMCB_SELECTOR_P_MASK | attrib;
678 seg->limit = 0xffff;
679 seg->base = 0;
/* Init a plain data segment: present, code/data (S), writable. */
static inline void
fkvm_init_seg(struct vmcb_seg *seg)
{
	_fkvm_init_seg(seg, VMCB_SELECTOR_S_MASK | VMCB_SELECTOR_WRITE_MASK);
}
/* Init a system segment (LDT/TSS): present plus the given system type. */
static inline void
fkvm_init_sys_seg(struct vmcb_seg *seg, uint16_t attrib)
{
	_fkvm_init_seg(seg, attrib);
}
/*
 * Allocate the I/O permission bitmap: physically contiguous and
 * page-aligned, since the VMCB stores its physical address.
 * NOTE(review): flags==0 passes neither M_WAITOK nor M_NOWAIT to
 * contigmalloc — confirm against this kernel's malloc(9) contract.
 */
static void*
fkvm_iopm_alloc(void)
{
	return contigmalloc(IOPM_SIZE, M_DEVBUF, 0, 0, -1UL, PAGE_SIZE, 0);
}
/* All bits set = intercept every port. */
static void
fkvm_iopm_init(void *iopm)
{
	memset(iopm, 0xff, IOPM_SIZE); /* TODO: we may want to allow access to PC debug port */
}
/* Release the I/O permission bitmap allocated by fkvm_iopm_alloc(). */
static void
fkvm_iopm_free(void *iopm)
{
	contigfree(iopm, IOPM_SIZE, M_DEVBUF);
}
/*
 * Allocate the MSR permission bitmap: physically contiguous and
 * page-aligned, since the VMCB stores its physical address.
 * NOTE(review): flags==0 passes neither M_WAITOK nor M_NOWAIT — see
 * fkvm_iopm_alloc().
 */
static void*
fkvm_msrpm_alloc(void)
{
	return contigmalloc(MSRPM_SIZE, M_DEVBUF, 0, 0, -1UL, PAGE_SIZE, 0);
}
/* All bits set = intercept every MSR access. */
static void
fkvm_msrpm_init(void *msrpm)
{
	memset(msrpm, 0xff, MSRPM_SIZE); /* TODO: we may want to allow some MSR accesses */
}
/* Release the MSR permission bitmap allocated by fkvm_msrpm_alloc(). */
static void
fkvm_msrpm_free(void *msrpm)
{
	contigfree(msrpm, MSRPM_SIZE, M_DEVBUF);
}
/*
 * Allocate the one-page host state-save area whose physical address is
 * programmed into the VM_HSAVE_PA MSR.
 */
static void*
fkvm_hsave_area_alloc(void)
{
	return contigmalloc(PAGE_SIZE, M_DEVBUF, 0, 0, -1UL, PAGE_SIZE, 0);
}
/*
 * Intentionally empty — presumably the hardware fills the host-save
 * area itself on VMRUN, so no initialization is needed (TODO confirm).
 */
static void
fkvm_hsave_area_init(void *hsave_area)
{
}
/* Release the host state-save area allocated by fkvm_hsave_area_alloc(). */
static void
fkvm_hsave_area_free(void *hsave_area)
{
	contigfree(hsave_area, PAGE_SIZE, M_DEVBUF);
}
747 static struct vmspace*
748 fkvm_make_vmspace(void)
750 struct vmspace *sp;
752 sp = vmspace_alloc(0, 0xffffffffffffffff);
753 if (sp == NULL) {
754 printf("vmspace_alloc failed\n");
755 return NULL;
758 return sp;
/* Drop our reference on a guest vmspace created by fkvm_make_vmspace(). */
static void
fkvm_destroy_vmspace(struct vmspace* sp)
{
	vmspace_free(sp);
}
/*
 * Allocate one zeroed, page-aligned, physically contiguous page for a
 * VMCB.  May return NULL (contigmalloc failure).
 * NOTE(review): only M_ZERO is passed; neither M_WAITOK nor M_NOWAIT —
 * confirm against this kernel's malloc(9) contract.
 */
static struct vmcb*
fkvm_vmcb_alloc(void)
{
	return contigmalloc(PAGE_SIZE, M_DEVBUF, M_ZERO, 0, -1UL,
			    PAGE_SIZE, 0);
}
/*
 * Fill a freshly zeroed VMCB with our baseline guest configuration:
 * which CR/DR/exception/instruction events to intercept, the permission
 * bitmap addresses, nested paging, and a save area matching the x86
 * power-on (real mode) register state.
 */
static void
fkvm_vmcb_init(struct vmcb *vmcb)
{
	struct vmcb_control_area *control = &vmcb->control;
	struct vmcb_save_area *save = &vmcb->save;

	control->intercept_cr_reads = INTERCEPT_CR4_MASK;

	control->intercept_cr_writes = INTERCEPT_CR4_MASK |
	                               INTERCEPT_CR8_MASK;

	control->intercept_dr_reads = INTERCEPT_DR0_MASK |
	                              INTERCEPT_DR1_MASK |
	                              INTERCEPT_DR2_MASK |
	                              INTERCEPT_DR3_MASK;

	control->intercept_dr_writes = INTERCEPT_DR0_MASK |
	                               INTERCEPT_DR1_MASK |
	                               INTERCEPT_DR2_MASK |
	                               INTERCEPT_DR3_MASK |
	                               INTERCEPT_DR5_MASK |
	                               INTERCEPT_DR7_MASK;

	control->intercept_exceptions = (1 << IDT_UD) | // Invalid Opcode
	                                (1 << IDT_MC);  // Machine Check

	/* All the SVM/priv instructions plus interrupts, HLT and I/O. */
	control->intercepts = INTERCEPT_INTR |
	                      INTERCEPT_NMI |
	                      INTERCEPT_SMI |
	                      INTERCEPT_CPUID |
	                      INTERCEPT_INVD |
	                      INTERCEPT_HLT |
	                      INTERCEPT_INVLPGA |
	                      INTERCEPT_IOIO_PROT |
	                      INTERCEPT_MSR_PROT |
	                      INTERCEPT_SHUTDOWN |
	                      INTERCEPT_VMRUN |
	                      INTERCEPT_VMMCALL |
	                      INTERCEPT_VMLOAD |
	                      INTERCEPT_VMSAVE |
	                      INTERCEPT_STGI |
	                      INTERCEPT_CLGI |
	                      INTERCEPT_SKINIT |
	                      INTERCEPT_WBINVD |
	                      INTERCEPT_MONITOR |
	                      INTERCEPT_MWAIT_UNCOND;

	control->iopm_base_pa = vtophys(iopm);
	control->msrpm_base_pa = vtophys(msrpm);
	control->tsc_offset = 0;

	/* TODO: remove this once we assign asid's to distinct VM's */
	control->guest_asid = 1;
	control->tlb_control = VMCB_TLB_CONTROL_FLUSH_ALL;

	/* let v_tpr default to 0 */
	/* let v_irq default to 0 */
	/* let v_intr default to 0 */

	/* Physical interrupts stay with the host; guest only sees virtual ones. */
	control->v_intr_masking = 1;

	/* let v_intr_vector default to 0 */
	/* let intr_shadow default to 0 */
	/* let exit_code, exit_info_1, exit_info_2, exit_int_info,
	   exit_int_info_err_code default to 0 */

	/* Enable nested paging. */
	control->nested_ctl = 1;

	/* let event_inj default to 0 */

	// (nested_cr3 is later)

	/* let lbr_virt_enable default to 0 */

	/* Save area: x86 reset state (real mode, execution at ffff:fff0). */
	fkvm_init_seg(&save->ds);
	fkvm_init_seg(&save->es);
	fkvm_init_seg(&save->fs);
	fkvm_init_seg(&save->gs);
	fkvm_init_seg(&save->ss);

	_fkvm_init_seg(&save->cs, VMCB_SELECTOR_READ_MASK | VMCB_SELECTOR_S_MASK |
	                          VMCB_SELECTOR_CODE_MASK);
	save->cs.selector = 0xf000;
	save->cs.base = 0xffff0000;

	save->gdtr.limit = 0xffff;
	save->idtr.limit = 0xffff;

	fkvm_init_sys_seg(&save->ldtr, SDT_SYSLDT);
	fkvm_init_sys_seg(&save->tr, SDT_SYS286BSY);

	save->g_pat = PAT_VALUE(PAT_WRITE_BACK, 0) | PAT_VALUE(PAT_WRITE_THROUGH, 1) |
	              PAT_VALUE(PAT_UNCACHED, 2) | PAT_VALUE(PAT_UNCACHEABLE, 3) |
	              PAT_VALUE(PAT_WRITE_BACK, 4) | PAT_VALUE(PAT_WRITE_THROUGH, 5) |
	              PAT_VALUE(PAT_UNCACHED, 6) | PAT_VALUE(PAT_UNCACHEABLE, 7);

	/* CR0 = 6000_0010h at boot */
	save->cr0 = CR0_ET | CR0_NW | CR0_CD;
	save->dr6 = 0xffff0ff0;
	save->dr7 = 0x400;
	save->rflags = 2;
	save->rip = 0x0000fff0;

	/* SVME must stay set while the guest runs (VMRUN requirement). */
	save->efer = EFER_SVME;
}
/* Release a VMCB page allocated by fkvm_vmcb_alloc(). */
static void
fkvm_vmcb_free(struct vmcb *vmcb)
{
	contigfree(vmcb, PAGE_SIZE, M_DEVBUF);
}
887 static struct vcpu*
888 fkvm_vcpu_create(struct guestvm *guest_vm)
890 struct vcpu *vcpu;
891 vcpu = malloc(sizeof(struct vcpu), M_DEVBUF, M_WAITOK|M_ZERO);
893 vcpu->vmcb = fkvm_vmcb_alloc();
894 vcpu->vmcb_pa = vtophys(vcpu->vmcb);
895 printf("vmcb = 0x%p\n", vcpu->vmcb);
896 printf("vcpu->vmcb_pa = 0x%lx\n", vcpu->vmcb_pa);
898 fkvm_vmcb_init(vcpu->vmcb);
899 vcpu->vmcb->control.nested_cr3 = guest_vm->nested_cr3;
900 vcpu->regs[VCPU_REGS_RIP] = vcpu->vmcb->save.rip;
902 vcpu->guest_vm = guest_vm;
904 return vcpu;
/* Free a vcpu and its VMCB page (inverse of fkvm_vcpu_create()). */
static void
fkvm_vcpu_destroy(struct vcpu *vcpu)
{
	fkvm_vmcb_free(vcpu->vmcb);
	free(vcpu, M_DEVBUF);
}
/* Allocate a zeroed guestvm; M_WAITOK, so this cannot return NULL. */
static struct guestvm*
fkvm_guestvm_alloc(void)
{
	return malloc(sizeof(struct guestvm), M_DEVBUF, M_WAITOK|M_ZERO);
}
/* Free a guestvm structure (vcpus must already have been destroyed). */
static void
fkvm_guestvm_free(struct guestvm* guest_vm)
{
	free(guest_vm, M_DEVBUF);
}
/*
 * Append a vcpu to the guest's vcpu table.
 * NOTE(review): no bounds check against MAX_VCPUS and, per the original
 * TODO, no locking around the slot/counter update.
 */
static void
fkvm_guestvm_add_vcpu(struct guestvm *guest_vm, struct vcpu *vcpu)
{
	guest_vm->vcpus[guest_vm->nr_vcpus] = vcpu;
	guest_vm->nr_vcpus++;  /* TODO: Probably not safe to increment */
	                       /* How about a lock to protect all of this? */
}
/*
 * fkvm_userpoke syscall: not implemented yet.  Returns ENODEV when the
 * module is not loaded, ENOSYS otherwise.
 */
int
fkvm_userpoke(struct thread *td, struct fkvm_userpoke_args *uap)
{
	printf("fkvm_userpoke\n");

	if (!fkvm_loaded)
		return ENODEV;

	return ENOSYS;
}
/*
 * Return non-zero iff looking up vaddr (for read|write) in vm_map yields
 * exactly expected_entry — i.e. the address falls inside that same map
 * entry.  Returns 0 when the lookup fails outright.
 */
static int
fkvm_mem_has_entry(vm_map_entry_t expected_entry, vm_map_t vm_map, vm_offset_t vaddr)
{
	vm_map_entry_t lookup_entry;
	vm_object_t throwaway_object;
	vm_pindex_t throwaway_pindex;
	vm_prot_t throwaway_prot;
	boolean_t throwaway_wired;
	int error;

	error = vm_map_lookup(&vm_map, /* IN/OUT */
			      vaddr,
			      VM_PROT_READ|VM_PROT_WRITE,
			      &lookup_entry, /* OUT */
			      &throwaway_object, /* OUT */
			      &throwaway_pindex, /* OUT */
			      &throwaway_prot, /* OUT */
			      &throwaway_wired); /* OUT */
	if (error != KERN_SUCCESS)
		return 0;
	vm_map_lookup_done(vm_map, lookup_entry);
	/* NOTE(review): lookup_entry is compared after lookup_done released
	 * the map lock — see the locking TODO in fkvm_guest_check_range(). */
	return (lookup_entry == expected_entry);
}
/*
 * Verify that [start, end] lies within a single mapped entry of the
 * guest's address space.  Returns 0 on success, EFAULT if either end is
 * unmapped or the two ends fall in different map entries.
 */
static int
fkvm_guest_check_range(struct guestvm *guest_vm, uint64_t start, uint64_t end)
{
	vm_map_t guest_vm_map;
	vm_map_entry_t lookup_entry;
	vm_object_t throwaway_object;
	vm_pindex_t throwaway_pindex;
	vm_prot_t throwaway_prot;
	boolean_t throwaway_wired;
	int ret;
	int error;

	guest_vm_map = &guest_vm->sp->vm_map;

	error = vm_map_lookup(&guest_vm_map, /* IN/OUT */
			      start,
			      VM_PROT_READ|VM_PROT_WRITE,
			      &lookup_entry, /* OUT */
			      &throwaway_object, /* OUT */
			      &throwaway_pindex, /* OUT */
			      &throwaway_prot, /* OUT */
			      &throwaway_wired); /* OUT */
	if (error != KERN_SUCCESS)
		return EFAULT;
	vm_map_lookup_done(guest_vm_map, lookup_entry);

	/*
	 * TODO: We can't actually nest the lookups:
	 * panic: _sx_xlock_hard: recursed on non-recursive sx user map @ ../../../vm/vm_map.c:3115
	 * Therefore, I've moved the lookup_done above for now, but we really need a lock here.
	 *
	 * Maybe it's better to use vm_map_lookup_entry directly.
	 * (NOTE(review): lookup_entry is used below after the map lock is
	 * dropped, so the entry could be freed/changed concurrently.)
	 */

	if (fkvm_mem_has_entry(lookup_entry, guest_vm_map, end))
		ret = 0;
	else
		ret = EFAULT;

	return ret;
}
1013 static void
1014 fkvm_get_regs_regs(struct vcpu *vcpu, struct kvm_regs *out)
1016 out->rax = vcpu->regs[VCPU_REGS_RAX];
1017 out->rbx = vcpu->regs[VCPU_REGS_RBX];
1018 out->rcx = vcpu->regs[VCPU_REGS_RCX];
1019 out->rdx = vcpu->regs[VCPU_REGS_RDX];
1020 out->rsi = vcpu->regs[VCPU_REGS_RSI];
1021 out->rdi = vcpu->regs[VCPU_REGS_RDI];
1022 out->rsp = vcpu->regs[VCPU_REGS_RSP];
1023 out->rbp = vcpu->regs[VCPU_REGS_RBP];
1024 out->r8 = vcpu->regs[VCPU_REGS_R8];
1025 out->r9 = vcpu->regs[VCPU_REGS_R9];
1026 out->r10 = vcpu->regs[VCPU_REGS_R10];
1027 out->r11 = vcpu->regs[VCPU_REGS_R11];
1028 out->r12 = vcpu->regs[VCPU_REGS_R12];
1029 out->r13 = vcpu->regs[VCPU_REGS_R13];
1030 out->r14 = vcpu->regs[VCPU_REGS_R14];
1031 out->r15 = vcpu->regs[VCPU_REGS_R15];
1032 out->rip = vcpu->regs[VCPU_REGS_RIP];
1033 out->rflags = vcpu->vmcb->save.rflags;
/*
 * Load a vcpu's general-purpose register state from a userspace
 * kvm_regs structure.  Mirror image of fkvm_get_regs_regs: GPRs and
 * rIP go into the software register file, rflags into the VMCB save
 * area.
 */
static void
fkvm_set_regs_regs(struct vcpu *vcpu, const struct kvm_regs *in)
{
	vcpu->regs[VCPU_REGS_RAX] = in->rax;
	vcpu->regs[VCPU_REGS_RBX] = in->rbx;
	vcpu->regs[VCPU_REGS_RCX] = in->rcx;
	vcpu->regs[VCPU_REGS_RDX] = in->rdx;
	vcpu->regs[VCPU_REGS_RSI] = in->rsi;
	vcpu->regs[VCPU_REGS_RDI] = in->rdi;
	vcpu->regs[VCPU_REGS_RSP] = in->rsp;
	vcpu->regs[VCPU_REGS_RBP] = in->rbp;
	vcpu->regs[VCPU_REGS_R8] = in->r8;
	vcpu->regs[VCPU_REGS_R9] = in->r9;
	vcpu->regs[VCPU_REGS_R10] = in->r10;
	vcpu->regs[VCPU_REGS_R11] = in->r11;
	vcpu->regs[VCPU_REGS_R12] = in->r12;
	vcpu->regs[VCPU_REGS_R13] = in->r13;
	vcpu->regs[VCPU_REGS_R14] = in->r14;
	vcpu->regs[VCPU_REGS_R15] = in->r15;
	vcpu->regs[VCPU_REGS_RIP] = in->rip;
	vcpu->vmcb->save.rflags = in->rflags;
}
/*
 * Copy a descriptor-table register (GDTR/IDTR) out of its VMCB segment
 * slot into the userspace-visible kvm_dtable representation.
 */
static void
fkvm_get_vmcb_dtable(struct vmcb_seg *vmcb_seg, struct kvm_dtable *fkvm_dtable)
{
	fkvm_dtable->base = vmcb_seg->base;
	fkvm_dtable->limit = vmcb_seg->limit;
}
/*
 * Load a descriptor-table register (GDTR/IDTR) from a userspace
 * kvm_dtable into its VMCB segment slot.
 */
static void
fkvm_set_vmcb_dtable(struct vmcb_seg *vmcb_seg, struct kvm_dtable *fkvm_dtable)
{
	vmcb_seg->base = fkvm_dtable->base;
	vmcb_seg->limit = fkvm_dtable->limit;
}
1073 static void
1074 fkvm_get_vmcb_seg(struct vmcb_seg *vmcb_seg, struct kvm_segment *fkvm_seg)
1076 fkvm_seg->base = vmcb_seg->base;
1077 fkvm_seg->limit = vmcb_seg->limit;
1078 fkvm_seg->selector = vmcb_seg->selector;
1080 if (vmcb_seg->attrib == 0)
1081 fkvm_seg->unusable = 1;
1082 else {
1083 fkvm_seg->type = (vmcb_seg->attrib & VMCB_SELECTOR_TYPE_MASK);
1084 fkvm_seg->s = (vmcb_seg->attrib & VMCB_SELECTOR_S_MASK) >> VMCB_SELECTOR_S_SHIFT;
1085 fkvm_seg->dpl = (vmcb_seg->attrib & VMCB_SELECTOR_DPL_MASK) >> VMCB_SELECTOR_DPL_SHIFT;
1086 fkvm_seg->present = (vmcb_seg->attrib & VMCB_SELECTOR_P_MASK) >> VMCB_SELECTOR_P_SHIFT;
1087 fkvm_seg->avl = (vmcb_seg->attrib & VMCB_SELECTOR_AVL_MASK) >> VMCB_SELECTOR_AVL_SHIFT;
1088 fkvm_seg->l = (vmcb_seg->attrib & VMCB_SELECTOR_L_MASK) >> VMCB_SELECTOR_L_SHIFT;
1089 fkvm_seg->db = (vmcb_seg->attrib & VMCB_SELECTOR_DB_MASK) >> VMCB_SELECTOR_DB_SHIFT;
1090 fkvm_seg->g = (vmcb_seg->attrib & VMCB_SELECTOR_G_MASK) >> VMCB_SELECTOR_G_SHIFT;
/*
 * Translate a userspace kvm_segment into the VMCB's packed segment
 * attribute encoding (inverse of fkvm_get_vmcb_seg).
 */
static void
fkvm_set_vmcb_seg(struct vmcb_seg *vmcb_seg, struct kvm_segment *fkvm_seg)
{
	vmcb_seg->base = fkvm_seg->base;
	vmcb_seg->limit = fkvm_seg->limit;
	vmcb_seg->selector = fkvm_seg->selector;

	if (fkvm_seg->unusable)
		vmcb_seg->attrib=0;	/* all-zero attrib = unusable segment */
	else {
		vmcb_seg->attrib = (fkvm_seg->type & VMCB_SELECTOR_TYPE_MASK);
		vmcb_seg->attrib |= (fkvm_seg->s & 1) << VMCB_SELECTOR_S_SHIFT;
		vmcb_seg->attrib |= (fkvm_seg->dpl & 3) << VMCB_SELECTOR_DPL_SHIFT;
		vmcb_seg->attrib |= (fkvm_seg->present & 1) << VMCB_SELECTOR_P_SHIFT;
		vmcb_seg->attrib |= (fkvm_seg->avl & 1) << VMCB_SELECTOR_AVL_SHIFT;
		vmcb_seg->attrib |= (fkvm_seg->l & 1) << VMCB_SELECTOR_L_SHIFT;
		vmcb_seg->attrib |= (fkvm_seg->db & 1) << VMCB_SELECTOR_DB_SHIFT;
		vmcb_seg->attrib |= (fkvm_seg->g & 1) << VMCB_SELECTOR_G_SHIFT;
	}
}
/*
 * Read the guest's CR8 (task-priority register), which SVM virtualizes
 * through the VMCB V_TPR field.
 */
static uint64_t
fkvm_get_cr8(struct vcpu *vcpu)
{
	// TODO: if cr8 has reserved bits inject GP Fault, return

	return (uint64_t) vcpu->vmcb->control.v_tpr;
}
/*
 * Write the guest's CR8 (task-priority register) into the VMCB V_TPR
 * field; only the low byte is architecturally meaningful.
 */
static void
fkvm_set_cr8(struct vcpu *vcpu, uint64_t cr8)
{
	// TODO: if cr8 has reserved bits inject GP Fault, return

	vcpu->vmcb->control.v_tpr = (uint8_t) cr8;
}
/*
 * Return the guest-visible EFER.  The SVME bit is forced on while the
 * guest runs (see fkvm_set_efer), so it is masked out of the value the
 * guest is allowed to see.
 */
static uint64_t
fkvm_get_efer(struct vcpu *vcpu)
{
	struct vmcb *vmcb = vcpu->vmcb;

	return vmcb->save.efer & (~EFER_SVME);
}
/*
 * Set the guest EFER, always forcing SVME on (required while running
 * under SVM).  Changing EFER.LME with paging enabled is architecturally
 * invalid and should raise #GP in the guest.
 */
static void
fkvm_set_efer(struct vcpu *vcpu, uint64_t efer)
{
	struct vmcb *vmcb = vcpu->vmcb;
	//TODO: if efer has reserved bits set: inject GP Fault

	if (vmcb->save.cr0 & CR0_PG) { //If paging is enabled do not allow changes to LME
		if ((vmcb->save.efer & EFER_LME) != (efer & EFER_LME)) {
			printf("fkvm_set_efer: attempt to change LME while paging\n");
			//TODO: inject GP fault
			/*
			 * NOTE(review): execution falls through and commits
			 * the new EFER below despite the error - confirm
			 * this is intended until #GP injection exists.
			 */
		}
	}

	vmcb->save.efer = efer | EFER_SVME;
}
/*
 * Copy a vcpu's system-register state (segment registers, descriptor
 * tables, control registers, EFER) into a userspace kvm_sregs
 * structure.  cr2/cr3 are tracked in software on the vcpu; the rest
 * come from the VMCB save area.
 */
static void
fkvm_get_regs_sregs(struct vcpu *vcpu, struct kvm_sregs *out)
{
	struct vmcb *vmcb = vcpu->vmcb;

	fkvm_get_vmcb_seg(&vmcb->save.cs, &out->cs);
	fkvm_get_vmcb_seg(&vmcb->save.ds, &out->ds);
	fkvm_get_vmcb_seg(&vmcb->save.es, &out->es);
	fkvm_get_vmcb_seg(&vmcb->save.fs, &out->fs);
	fkvm_get_vmcb_seg(&vmcb->save.gs, &out->gs);
	fkvm_get_vmcb_seg(&vmcb->save.ss, &out->ss);
	fkvm_get_vmcb_seg(&vmcb->save.tr, &out->tr);
	fkvm_get_vmcb_seg(&vmcb->save.ldtr, &out->ldt);

	fkvm_get_vmcb_dtable(&vmcb->save.idtr, &out->idt);
	fkvm_get_vmcb_dtable(&vmcb->save.gdtr, &out->gdt);

	out->cr2 = vcpu->cr2;
	out->cr3 = vcpu->cr3;

	out->cr8 = fkvm_get_cr8(vcpu);
	out->efer = fkvm_get_efer(vcpu);
	/* TODO: apic_base */
	out->cr0 = vmcb->save.cr0;
	out->cr4 = vmcb->save.cr4;
	/* TODO: irq_pending, interrupt_bitmap, irq_summary */
}
/*
 * Load a vcpu's system-register state from a userspace kvm_sregs
 * structure (inverse of fkvm_get_regs_sregs).  CPL is derived from the
 * DPL bits of the freshly-written CS attribute word.
 */
static void
fkvm_set_regs_sregs(struct vcpu *vcpu, struct kvm_sregs *in)
{
	struct vmcb *vmcb = vcpu->vmcb;

	fkvm_set_vmcb_seg(&vmcb->save.cs, &in->cs);
	fkvm_set_vmcb_seg(&vmcb->save.ds, &in->ds);
	fkvm_set_vmcb_seg(&vmcb->save.es, &in->es);
	fkvm_set_vmcb_seg(&vmcb->save.fs, &in->fs);
	fkvm_set_vmcb_seg(&vmcb->save.gs, &in->gs);
	fkvm_set_vmcb_seg(&vmcb->save.ss, &in->ss);
	fkvm_set_vmcb_seg(&vmcb->save.tr, &in->tr);
	fkvm_set_vmcb_seg(&vmcb->save.ldtr, &in->ldt);

	/* CPL comes from the DPL field of CS. */
	vmcb->save.cpl = (vmcb->save.cs.attrib >> VMCB_SELECTOR_DPL_SHIFT) & 3;

	fkvm_set_vmcb_dtable(&vmcb->save.idtr, &in->idt);
	fkvm_set_vmcb_dtable(&vmcb->save.gdtr, &in->gdt);

	vcpu->cr2 = in->cr2;
	vcpu->cr3 = in->cr3;

	fkvm_set_cr8(vcpu, in->cr8);
	fkvm_set_efer(vcpu, in->efer);
	/* TODO: apic_base */
	vmcb->save.cr0 = in->cr0;
	vmcb->save.cr4 = in->cr4;
	/* TODO: irq_pending, interrupt_bitmap, irq_summary */
}
/*
 * Read guest MSR `index` into *data.
 *
 * Returns 0 on success, or ENOSYS for MSRs that are recognized but not
 * yet implemented and for unknown MSRs (logged).  Most values come
 * straight from the VMCB save area; the TSC is virtualized through the
 * VMCB tsc_offset, and the MTRRs are emulated in software on the vcpu.
 */
static int
fkvm_get_reg_msr(struct vcpu *vcpu, uint32_t index, uint64_t *data) {
	struct vmcb *vmcb = vcpu->vmcb;

	switch(index) {

	case MSR_TSC: {
		uint64_t tsc;

		/* Guest TSC = host TSC + VMCB offset. */
		tsc = rdtsc();
		*data = vmcb->control.tsc_offset + tsc;
		break;
	}

	case MSR_STAR: {
		*data = vmcb->save.star;
		break;
	}

	case MSR_LSTAR: {
		*data = vmcb->save.lstar;
		break;
	}

	case MSR_CSTAR: {
		*data = vmcb->save.cstar;
		break;
	}

	case MSR_GSBASE: {
		*data = vmcb->save.kernel_gs_base;
		break;
	}

	case MSR_SF_MASK: {
		*data = vmcb->save.sfmask;
		break;
	}

	case MSR_SYSENTER_CS_MSR: {
		*data = vmcb->save.sysenter_cs;
		break;
	}

	case MSR_SYSENTER_EIP_MSR: {
		*data = vmcb->save.sysenter_eip;
		break;
	}

	case MSR_SYSENTER_ESP_MSR: {
		*data = vmcb->save.sysenter_esp;
		break;
	}

	case MSR_DEBUGCTLMSR: {
		printf("unimplemented at %d\n", __LINE__);
		return ENOSYS;
		break;
	}

	case MSR_PERFEVSEL0 ... MSR_PERFEVSEL3:
	case MSR_PERFCTR0 ... MSR_PERFCTR3: {
		printf("unimplemented at %d\n", __LINE__);
		return ENOSYS;
		break;
	}

	case MSR_EFER: {
		/* Hide the forced SVME bit from the guest. */
		*data = fkvm_get_efer(vcpu);
		break;
	}

	case MSR_MC0_STATUS: {
		printf("unimplemented at %d\n", __LINE__);
		return ENOSYS;
		break;
	}

	case MSR_MCG_STATUS: {
		printf("unimplemented at %d\n", __LINE__);
		return ENOSYS;
		break;
	}

	case MSR_MCG_CTL: {
		printf("unimplemented at %d\n", __LINE__);
		return ENOSYS;
		break;
	}

	//TODO: MSR_IA32_UCODE_REV
	//TODO: MSR_IA32_UCODE_WRITE

	case MSR_MTRRcap: {
		/* Advertise write-combining, fixed MTRRs and our number
		 * of variable-range MTRRs. */
		*data = MTRR_CAP_WC | MTRR_CAP_FIXED | FKVM_MTRR_NVAR;
		break;
	}

	case MSR_MTRRdefType: {
		*data = vcpu->mtrrs.default_type;
		break;
	}

	case MSR_MTRR64kBase ... (MSR_MTRR64kBase + MTRR_N64K - 1): {
		*data = vcpu->mtrrs.mtrr64k[index - MSR_MTRR64kBase];
		break;
	}

	case MSR_MTRR16kBase ... (MSR_MTRR16kBase + MTRR_N16K - 1): {
		*data = vcpu->mtrrs.mtrr16k[index - MSR_MTRR16kBase];
		break;
	}

	case MSR_MTRR4kBase ... (MSR_MTRR4kBase + MTRR_N4K - 1): {
		*data = vcpu->mtrrs.mtrr4k[index - MSR_MTRR4kBase];
		break;
	}

	case MSR_MTRRVarBase ... (MSR_MTRRVarBase + FKVM_MTRR_NVAR * 2 - 1): {
		/* Variable MTRRs come in base/mask pairs, hence * 2. */
		*data = vcpu->mtrrs.mtrrvar[index - MSR_MTRRVarBase];
		break;
	}

	case MSR_APICBASE: {
		printf("unimplemented at %d\n", __LINE__);
		return ENOSYS;
		break;
	}

	case MSR_IA32_MISC_ENABLE: {
		printf("unimplemented at %d\n", __LINE__);
		return ENOSYS;
		break;
	}

	//TODO: MSR_KVM_WALL_CLOCK
	//TODO: MSR_KVM_SYSTEM_TIME

	default:
		printf("Did not get unimplemented msr: 0x%" PRIx32 "\n", index);
		return ENOSYS;
	}

	return 0;
}
/*
 * Read a batch of guest MSRs: for each entry, look up entry->index and
 * fill in entry->data in place.
 *
 * NOTE(review): per-MSR errors from fkvm_get_reg_msr are ignored, so
 * unimplemented MSRs silently return data as zero-filled by the caller
 * - confirm this best-effort behavior is intended.
 */
static void
fkvm_get_regs_msrs(struct vcpu *vcpu, uint32_t nmsrs, struct kvm_msr_entry *entries) {
	int i;

	for (i = 0; i < nmsrs; i++) {
		fkvm_get_reg_msr(vcpu, entries[i].index, &entries[i].data);
	}
}
/*
 * Write guest MSR `index` with `data`.
 *
 * Returns 0 on success, or ENOSYS for recognized-but-unimplemented and
 * unknown MSRs (logged).  Mirror image of fkvm_get_reg_msr: the TSC
 * write is converted to a VMCB tsc_offset, EFER goes through
 * fkvm_set_efer (which forces SVME), and the MTRRs are stored in the
 * software MTRR state on the vcpu.
 */
static int
fkvm_set_reg_msr(struct vcpu *vcpu, uint32_t index, uint64_t data) {
	struct vmcb *vmcb = vcpu->vmcb;

	switch(index) {

	case MSR_TSC: {
		uint64_t tsc;

		/* Store the requested guest TSC as an offset from the
		 * host TSC. */
		tsc = rdtsc();
		vmcb->control.tsc_offset = data - tsc;
		break;
	}

	case MSR_STAR: {
		vmcb->save.star = data;
		break;
	}

	case MSR_LSTAR: {
		vmcb->save.lstar = data;
		break;
	}

	case MSR_CSTAR: {
		vmcb->save.cstar = data;
		break;
	}

	case MSR_GSBASE: {
		vmcb->save.kernel_gs_base = data;
		break;
	}

	case MSR_SF_MASK: {
		vmcb->save.sfmask = data;
		break;
	}

	case MSR_SYSENTER_CS_MSR: {
		vmcb->save.sysenter_cs = data;
		break;
	}

	case MSR_SYSENTER_EIP_MSR: {
		vmcb->save.sysenter_eip = data;
		break;
	}

	case MSR_SYSENTER_ESP_MSR: {
		vmcb->save.sysenter_esp = data;
		break;
	}

	case MSR_DEBUGCTLMSR: {
		printf("unimplemented at %d\n", __LINE__);
		return ENOSYS;
		break;
	}

	case MSR_PERFEVSEL0 ... MSR_PERFEVSEL3:
	case MSR_PERFCTR0 ... MSR_PERFCTR3: {
		printf("unimplemented at %d\n", __LINE__);
		return ENOSYS;
		break;
	}

	case MSR_EFER: {
		fkvm_set_efer(vcpu, data);
		break;
	}

	case MSR_MC0_STATUS: {
		printf("unimplemented at %d\n", __LINE__);
		return ENOSYS;
		break;
	}

	case MSR_MCG_STATUS: {
		printf("unimplemented at %d\n", __LINE__);
		return ENOSYS;
		break;
	}

	case MSR_MCG_CTL: {
		printf("unimplemented at %d\n", __LINE__);
		return ENOSYS;
		break;
	}

	//TODO: MSR_IA32_UCODE_REV
	//TODO: MSR_IA32_UCODE_WRITE

	/* No MSR_MTRRcap case: that MSR is read-only. */
	case MSR_MTRRdefType: {
		vcpu->mtrrs.default_type = data;
		break;
	}

	case MSR_MTRR64kBase ... (MSR_MTRR64kBase + MTRR_N64K - 1): {
		vcpu->mtrrs.mtrr64k[index - MSR_MTRR64kBase] = data;
		break;
	}

	case MSR_MTRR16kBase ... (MSR_MTRR16kBase + MTRR_N16K - 1): {
		vcpu->mtrrs.mtrr16k[index - MSR_MTRR16kBase] = data;
		break;
	}

	case MSR_MTRR4kBase ... (MSR_MTRR4kBase + MTRR_N4K - 1): {
		vcpu->mtrrs.mtrr4k[index - MSR_MTRR4kBase] = data;
		break;
	}

	case MSR_MTRRVarBase ... (MSR_MTRRVarBase + FKVM_MTRR_NVAR * 2 - 1): {
		/* Variable MTRRs come in base/mask pairs, hence * 2. */
		vcpu->mtrrs.mtrrvar[index - MSR_MTRRVarBase] = data;
		break;
	}

	case MSR_APICBASE: {
		printf("unimplemented at %d\n", __LINE__);
		return ENOSYS;
		break;
	}

	case MSR_IA32_MISC_ENABLE: {
		printf("unimplemented at %d\n", __LINE__);
		return ENOSYS;
		break;
	}

	//TODO: MSR_KVM_WALL_CLOCK
	//TODO: MSR_KVM_SYSTEM_TIME

	default:
		printf("Did not set unimplemented msr: 0x%" PRIx32 "\n", index);
		return ENOSYS;
	}

	return 0;
}
/*
 * Write a batch of guest MSRs from an array of (index, data) entries.
 *
 * NOTE(review): per-MSR errors from fkvm_set_reg_msr are ignored
 * (best-effort), matching fkvm_get_regs_msrs - confirm intended.
 */
static void
fkvm_set_regs_msrs(struct vcpu *vcpu, uint32_t nmsrs, struct kvm_msr_entry *entries) {
	int i;

	for (i = 0; i < nmsrs; i++) {
		fkvm_set_reg_msr(vcpu, entries[i].index, entries[i].data);
	}
}
1518 /* System Calls */
1521 fkvm_get_regs(struct thread *td, struct fkvm_get_regs_args *uap)
1523 struct vcpu *vcpu;
1524 int error;
1526 if (!fkvm_loaded)
1527 return ENODEV;
1529 vcpu = TD_GET_VCPU(td);
1530 if (vcpu == NULL)
1531 return ENODEV;
1533 switch (uap->type) {
1535 case FKVM_REGS_TYPE_REGS: {
1536 struct kvm_regs out;
1537 fkvm_get_regs_regs(vcpu, &out);
1538 return copyout(&out, uap->regs, sizeof(out));
1541 case FKVM_REGS_TYPE_SREGS: {
1542 struct kvm_sregs out;
1543 fkvm_get_regs_sregs(vcpu, &out);
1544 return copyout(&out, uap->regs, sizeof(out));
1547 case FKVM_REGS_TYPE_MSRS: {
1548 struct kvm_msr_entry *user_entries;
1549 struct kvm_msr_entry *entries;
1550 int size;
1552 user_entries = (struct kvm_msr_entry *)uap->regs;
1554 size = sizeof(*entries) * uap->n;
1555 entries = malloc(size, M_DEVBUF, M_WAITOK|M_ZERO);
1556 if (entries == NULL)
1557 return ENOMEM;
1559 error = copyin(user_entries, entries, size);
1560 if (error != 0) {
1561 printf("FKVM_REGS_TYPE_MSRS: unable to copyin entries\n");
1562 free(entries, M_DEVBUF);
1563 return error;
1566 fkvm_get_regs_msrs(vcpu, uap->n, entries);
1568 error = copyout(user_entries, entries, size);
1569 if (error != 0) {
1570 printf("FKVM_REGS_TYPE_MSRS: unable to copyout entries\n");
1573 free(entries, M_DEVBUF);
1574 return error;
1577 default:
1578 return EINVAL;
1583 fkvm_set_regs(struct thread *td, struct fkvm_set_regs_args *uap)
1585 struct vcpu *vcpu;
1586 int error = 0;
1588 vcpu = TD_GET_VCPU(td);
1589 if (vcpu == NULL)
1590 return ENODEV;
1592 switch (uap->type) {
1594 case FKVM_REGS_TYPE_REGS: {
1595 struct kvm_regs in;
1596 error = copyin(uap->regs, &in, sizeof(in));
1597 if (error != 0)
1598 return error;
1599 fkvm_set_regs_regs(vcpu, &in);
1600 return 0;
1603 case FKVM_REGS_TYPE_SREGS: {
1604 struct kvm_sregs in;
1605 error = copyin(uap->regs, &in, sizeof(in));
1606 if (error != 0)
1607 return error;
1608 fkvm_set_regs_sregs(vcpu, &in);
1609 return 0;
1612 case FKVM_REGS_TYPE_MSRS: {
1613 struct kvm_msr_entry *user_entries;
1614 struct kvm_msr_entry *entries;
1615 int size;
1617 user_entries = (struct kvm_msr_entry *)uap->regs;
1619 size = sizeof(*entries) * uap->n;
1620 entries = malloc(size, M_DEVBUF, M_WAITOK|M_ZERO);
1621 if (entries == NULL)
1622 return ENOMEM;
1624 error = copyin(user_entries, entries, size);
1625 if (error != 0) {
1626 printf("FKVM_REGS_TYPE_MSRS: unable to copyin entries\n");
1627 free(entries, M_DEVBUF);
1628 return error;
1631 fkvm_set_regs_msrs(vcpu, uap->n, entries);
1633 free(entries, M_DEVBUF);
1634 return error;
1637 default:
1638 return EINVAL;
/* This function can only be called with multiples of page sizes */
/* vaddr as NULL overloads to fkvm_guest_check_range */
/*
 * fkvm_set_user_mem_region syscall: back a region of guest-physical
 * address space with the caller's own memory.  Looks up the VM object
 * backing uap->vaddr in the process's map, takes a reference on it,
 * and inserts it into the guest vmspace at [uap->guest_pa,
 * uap->guest_pa + uap->size).  Returns 0 or an errno value.
 */
int
fkvm_set_user_mem_region(struct thread *td, struct fkvm_set_user_mem_region_args *uap)
{
	struct guestvm *guest_vm;

	vm_offset_t start;
	vm_offset_t end;

	struct vmspace *user_vm_space;
	vm_map_t user_vm_map;

	vm_object_t vm_object;
	vm_pindex_t vm_object_pindex;
	vm_ooffset_t vm_object_offset;
	vm_prot_t throwaway_prot;
	boolean_t throwaway_wired;
	vm_map_entry_t lookup_entry;

	int error;

	guest_vm = PROC_GET_GUESTVM(td->td_proc);
	if (guest_vm == NULL) {
		printf("PROC_GET_GUESTVM -> NULL\n");
		return ENODEV;
	}

	/* end is inclusive of the last byte in the region. */
	start = uap->guest_pa;
	end = uap->guest_pa + uap->size - 1;
	printf("start: 0x%" PRIx64 " bytes\n", start);
	printf("end:   0x%" PRIx64 " bytes\n", end);

	/* NULL vaddr means "just validate the guest range". */
	if (uap->vaddr == 0)
		return fkvm_guest_check_range(guest_vm, start, end);

	user_vm_space = td->td_proc->p_vmspace;
	user_vm_map = &user_vm_space->vm_map;
	printf("user vm space: %p\n", user_vm_space);
	printf("user vm map: %p\n", user_vm_map);

	error = vm_map_lookup(&user_vm_map, /* IN/OUT */
			      uap->vaddr,
			      VM_PROT_READ|VM_PROT_WRITE,
			      &lookup_entry, /* OUT */
			      &vm_object, /* OUT */
			      &vm_object_pindex, /* OUT */
			      &throwaway_prot, /* OUT */
			      &throwaway_wired); /* OUT */
	if (error != KERN_SUCCESS) {
		printf("vm_map_lookup failed: %d\n", error);
		return EFAULT;
	}

	/* TODO: Trust the user that the full region is valid.
	 * This is very bad. See the note in fkvm_guest_check_range
	 * on nesting vm lookups. */
#if 0
	if (!fkvm_mem_has_entry(lookup_entry, user_vm_map, uap->vaddr + uap->size)) {
		printf("end of range not contained in same vm map entry as start\n");
		return EFAULT;
	}
#endif

	printf("vm object: %p\n", vm_object);
	printf("  size: %d pages\n", (int) vm_object->size);

	/* Offset of the mapped page within the backing object. */
	vm_object_offset = IDX_TO_OFF(vm_object_pindex);
	printf("vm_ooffset: 0x%" PRIx64 "\n", vm_object_offset);

	vm_object_reference(vm_object); // TODO: this might be a mem leak

	vm_map_lookup_done(user_vm_map, lookup_entry);

	error = vm_map_insert(&guest_vm->sp->vm_map,
			      vm_object,
			      vm_object_offset,
			      start,
			      end,
			      VM_PROT_ALL, VM_PROT_ALL,
			      0); /* NOTE(review): cow flags assumed 0 - confirm against original */
	if (error != KERN_SUCCESS) {
		printf("vm_map_insert failed: %d\n", error);
		switch (error) {
		case KERN_INVALID_ADDRESS:
			return EINVAL;
		case KERN_NO_SPACE:
			return ENOMEM;
		default:
			return 1;
		}
	}

	return 0;
}
1739 fkvm_unset_user_mem_region(struct thread *td, struct fkvm_unset_user_mem_region_args *uap)
1741 struct guestvm *guest_vm;
1743 if (!fkvm_loaded)
1744 return ENODEV;
1746 guest_vm = PROC_GET_GUESTVM(td->td_proc);
1747 if (guest_vm == NULL) {
1748 printf("PROC_GET_GUESTVM -> NULL\n");
1749 return ENODEV;
1752 vm_offset_t start;
1753 vm_offset_t end;
1755 vm_map_t guest_vm_map;
1757 int error;
1759 start = uap->guest_pa;
1760 end = uap->guest_pa + uap->size - 1;
1761 printf("start: 0x%" PRIx64 " bytes\n", start);
1762 printf("end: 0x%" PRIx64 " bytes\n", end);
1764 guest_vm_map = &guest_vm->sp->vm_map;
1766 error = vm_map_remove(guest_vm_map, start, end);
1767 if (error != KERN_SUCCESS)
1768 return -1;
1770 return 0;
/*
 * fkvm_create_vm syscall: allocate a guestvm, build its nested-paging
 * vmspace, record the nested CR3 (physical address of the vmspace's
 * PML4), and attach the VM to the calling process.  Returns 0 or an
 * errno value.
 */
int
fkvm_create_vm(struct thread *td, struct fkvm_create_vm_args *uap)
{
	struct guestvm *guest_vm;

	printf("SYSCALL : fkvm_create_vm\n");

	if (!fkvm_loaded)
		return ENODEV;

	/* Allocate Guest VM */
	guest_vm = fkvm_guestvm_alloc();

	/* Set up the vm address space */
	guest_vm->sp = fkvm_make_vmspace();
	if (guest_vm->sp == NULL) {
		fkvm_guestvm_free(guest_vm);
		return ENOMEM;
	}
	/* nCR3 points at the guest vmspace's top-level page table. */
	guest_vm->nested_cr3 = vtophys(vmspace_pmap(guest_vm->sp)->pm_pml4);

	printf("guest:\n");
	printf(" vm space: %p\n", guest_vm->sp);
	printf(" vm map: %p\n", &guest_vm->sp->vm_map);
	printf(" ncr3: 0x%" PRIx64 "\n", guest_vm->nested_cr3);

	PROC_SET_GUESTVM(td->td_proc, guest_vm);

	printf("fkvm_create_vm done\n");
	return 0;
}
/*
 * Tear down a guest VM: destroy each vcpu (in reverse creation order),
 * release the nested-paging vmspace, then free the guestvm itself.
 * Called from the process-exit eventhandler (fkvm_proc_exit).
 */
static void
fkvm_destroy_vm(struct guestvm *guest_vm)
{
	/* Destroy the VCPUs */
	while (guest_vm->nr_vcpus > 0) {
		guest_vm->nr_vcpus--;
		fkvm_vcpu_destroy(guest_vm->vcpus[guest_vm->nr_vcpus]);
		guest_vm->vcpus[guest_vm->nr_vcpus] = NULL;
	}

	/* Destroy the vmspace */
	if (guest_vm->sp != NULL)
		fkvm_destroy_vmspace(guest_vm->sp);

	/* Destroy the Guest VM itself */
	fkvm_guestvm_free(guest_vm);
}
/*
 * Handle an IOIO (in/out instruction) intercept: decode EXITINFO1
 * (ioio_info) into the kvm_run io fields for userspace emulation, and
 * advance the saved rIP past the instruction (EXITINFO2 holds the next
 * rIP for IOIO exits).  Returns 0 on success.
 */
static int
intercept_ioio(struct vcpu *vcpu, struct kvm_run *kvm_run, uint64_t ioio_info, uint64_t rip)
{
	struct vmcb *vmcb = vcpu->vmcb;

	kvm_run->u.io.string = (ioio_info & STR_MASK) >> STR_SHIFT;

	kvm_run->u.io.port = ioio_info >> PORT_SHIFT;
	kvm_run->u.io.in = ioio_info & TYPE_MASK;

	kvm_run->u.io.size = (ioio_info & SIZE_MASK) >> SIZE_SHIFT;

	/* We need to remove the Interrupt Shadow Flag from the VMCB (see 15.20.5 in AMD_Vol2) */
	vmcb->control.intr_shadow = 0;

	kvm_run->u.io.rep = (ioio_info & REP_MASK) >> REP_SHIFT;
	/* TODO: Research more into Direction Flag checked in KVM; DF bit in RFLAGS */

	/* set the next rip in the VMCB save area for now */
	/* TODO: Store rIP in vm_run structure until we absolutely need it */
	vcpu->regs[VCPU_REGS_RIP] = rip;

	return 0;
}
/*
 * Handle a guest shutdown (triple-fault) intercept by resetting the
 * VMCB to its freshly-initialized state; the VMCB occupies exactly one
 * page.
 */
static void
intercept_shutdown(struct vcpu *vcpu)
{
	struct vmcb *vmcb = vcpu->vmcb;
	memset(vmcb, 0, PAGE_SIZE);
	fkvm_vmcb_init(vmcb);
}
/*
 * fkvm_vm_run syscall: repeatedly enter the guest on the calling
 * thread's vcpu and dispatch VM exits, until an exit needs userspace
 * handling (exit_reason != KVM_EXIT_CONTINUE) or a run-count cap is
 * hit.  Before returning, the out fields of kvm_run (exit reason and
 * payload, if_flag, interrupt-injection readiness, cr8) are copied
 * back to uap->run.  Returns 0 or an errno value in `ret`.
 */
int
fkvm_vm_run(struct thread *td, struct fkvm_vm_run_args *uap)
{
	struct vcpu *vcpu;
	struct guestvm *guest_vm;
	struct vmcb *vmcb;
	int error;
	int ret = 0;
	int num_runs = 0;
	struct kvm_run kvm_run;

	if (!fkvm_loaded)
		return ENODEV;

	vcpu = TD_GET_VCPU(td);
	if (vcpu == NULL)
		return ENODEV;

	guest_vm = vcpu->guest_vm;
	vmcb = vcpu->vmcb;

	error = copyin(uap->run, &kvm_run, sizeof(struct kvm_run));
	if (error != 0)
		return error;

	/* Userspace may have updated the TPR while we were out. */
	fkvm_set_cr8(vcpu, kvm_run.cr8);

	kvm_run.exit_reason = KVM_EXIT_CONTINUE;

	while(kvm_run.exit_reason == KVM_EXIT_CONTINUE) {
		fkvm_vcpu_run(vcpu);

		switch (vmcb->control.exit_code) {

		case VMCB_EXIT_EXCP_BASE ... (VMCB_EXIT_EXCP_BASE + 31): {
			/* Intercepted guest exception; not handled yet. */
			int excp_vector;

			excp_vector = vmcb->control.exit_code - VMCB_EXIT_EXCP_BASE;

			printf("VMCB_EXIT_EXCP_BASE, exception vector: 0x%x\n",
			       excp_vector);
			kvm_run.exit_reason = KVM_EXIT_UNKNOWN;
			ret = ENOSYS;
			break;
		}

		case VMCB_EXIT_INTR: {
			printf("VMCB_EXIT_INTR - nothing to do\n");
			/* Handled by host OS already */
			kvm_run.exit_reason = KVM_EXIT_CONTINUE;
			break;
		}

		case VMCB_EXIT_NPF: {
			/* EXITINFO1 contains fault error code */
			/* EXITINFO2 contains the guest physical address causing the fault. */
			u_int64_t fault_code;
			u_int64_t fault_gpa;

			vm_prot_t fault_type;
			int fault_flags;
			int rc;

			fault_code = vmcb->control.exit_info_1;
			fault_gpa = vmcb->control.exit_info_2;
			kvm_run.exit_reason = KVM_EXIT_CONTINUE;

#if 0
			printf("VMCB_EXIT_NPF:\n");
			printf("gpa=0x%" PRIx64 "\n", fault_gpa);
			printf("fault code=0x%" PRIx64 " [P=%x, R/W=%x, U/S=%x, I/D=%x]\n",
			       fault_code,
			       (fault_code & PGEX_P) != 0,
			       (fault_code & PGEX_W) != 0,
			       (fault_code & PGEX_U) != 0,
			       (fault_code & PGEX_I) != 0);
#endif
			/* Map the page-fault error code onto a VM fault type. */
			if (fault_code & PGEX_W)
				fault_type = VM_PROT_WRITE;
			else if (fault_code & PGEX_I)
				fault_type = VM_PROT_EXECUTE;
			else
				fault_type = VM_PROT_READ;

			fault_flags = 0; /* TODO: is that right? */
			rc = vm_fault(&guest_vm->sp->vm_map, (fault_gpa & (~PAGE_MASK)), fault_type, fault_flags);
			if (rc != KERN_SUCCESS) {
				/* Not backed by guest memory: hand to
				 * userspace as an MMIO exit. */
				printf("vm_fault failed: %d\n", rc);
				kvm_run.u.mmio.fault_gpa = fault_gpa;
				kvm_run.u.mmio.rip = vcpu->regs[VCPU_REGS_RIP];
				kvm_run.u.mmio.cs_base = vmcb->save.cs.base;
				kvm_run.exit_reason = KVM_EXIT_MMIO;
			}

			break;
		}

		case VMCB_EXIT_WRITE_CR8:
			kvm_run.exit_reason = KVM_EXIT_SET_TPR;
			break;
		case VMCB_EXIT_NMI:
			kvm_run.exit_reason = KVM_EXIT_NMI;
			break;
		case VMCB_EXIT_HLT:
			kvm_run.exit_reason = KVM_EXIT_HLT;
			break;
		case VMCB_EXIT_SHUTDOWN:
			intercept_shutdown(vcpu);
			kvm_run.exit_reason = KVM_EXIT_SHUTDOWN;
			break;
		case VMCB_EXIT_IOIO:
			error = intercept_ioio(vcpu, &kvm_run,
					       vmcb->control.exit_info_1,
					       vmcb->control.exit_info_2);
			if (error)
				kvm_run.exit_reason = KVM_EXIT_UNKNOWN;
			else
				kvm_run.exit_reason = KVM_EXIT_IO;
			break;
		case VMCB_EXIT_MSR: {
			int wrmsr;
			uint32_t msr;
			union {
				struct {
					uint32_t low;
					uint32_t high;
				} split;
				uint64_t full;
			} value;

			/* EXITINFO1: 1 = wrmsr, 0 = rdmsr.  The MSR index
			 * is in guest ECX; edx:eax carry the value. */
			wrmsr = vmcb->control.exit_info_1;
			msr = (uint32_t) vcpu->regs[VCPU_REGS_RCX];

			printf("VMCB_EXIT_MSR:\n"
			       "  %s msr 0x%" PRIx64 "\n",
			       wrmsr ? "write to" : "read from",
			       vcpu->regs[VCPU_REGS_RCX]);

			if (!wrmsr) { /* rdmsr */
				error = fkvm_get_reg_msr(vcpu, msr, &value.full);
				if (error != 0) {
					ret = ENOSYS;
					kvm_run.exit_reason = KVM_EXIT_UNKNOWN;
					break;
				}

				vcpu->regs[VCPU_REGS_RDX] = (uint64_t) value.split.high;
				vcpu->regs[VCPU_REGS_RAX] = (uint64_t) value.split.low;
			}
			else { /* wrmsr */
				value.split.high = (uint32_t) vcpu->regs[VCPU_REGS_RDX];
				value.split.low = (uint32_t) vcpu->regs[VCPU_REGS_RAX];

				error = fkvm_set_reg_msr(vcpu, msr, value.full);
				if (error != 0) {
					ret = ENOSYS;
					kvm_run.exit_reason = KVM_EXIT_UNKNOWN;
					break;
				}
			}

			/* rdmsr/wrmsr are 2-byte instructions. */
			vcpu->regs[VCPU_REGS_RIP] += 2;
			break;
		}
		case VMCB_EXIT_CPUID: {
			/* Let userspace emulate cpuid; pass the leaf. */
			kvm_run.u.cpuid.fn = (uint32_t) vcpu->regs[VCPU_REGS_RAX];
			kvm_run.exit_reason = KVM_EXIT_CPUID;
			break;
		}
		case VMCB_EXIT_WBINVD: {
			/* TODO: stop ignoring this intercept when we have more than 1-cpu guests */
			/* wbinvd is a 2-byte instruction. */
			vcpu->regs[VCPU_REGS_RIP] += 2;
			break;
		}
		case VMCB_EXIT_READ_CR0:
		case VMCB_EXIT_READ_CR3:
		case VMCB_EXIT_READ_CR4:
		case VMCB_EXIT_READ_CR8:
		case VMCB_EXIT_WRITE_CR0:
		case VMCB_EXIT_WRITE_CR3:
		case VMCB_EXIT_WRITE_CR4:
		case VMCB_EXIT_READ_DR0:
		case VMCB_EXIT_READ_DR1:
		case VMCB_EXIT_READ_DR2:
		case VMCB_EXIT_READ_DR3:
		case VMCB_EXIT_WRITE_DR0:
		case VMCB_EXIT_WRITE_DR1:
		case VMCB_EXIT_WRITE_DR2:
		case VMCB_EXIT_WRITE_DR3:
		case VMCB_EXIT_WRITE_DR5:
		case VMCB_EXIT_WRITE_DR7:
		case VMCB_EXIT_SMI:
		case VMCB_EXIT_INIT:
		case VMCB_EXIT_VINTR:
		case VMCB_EXIT_CR0_SEL_WRITE:
		case VMCB_EXIT_INVD:
		case VMCB_EXIT_INVLPG:
		case VMCB_EXIT_INVLPGA:
		case VMCB_EXIT_TASK_SWITCH:
		case VMCB_EXIT_VMRUN:
		case VMCB_EXIT_VMMCALL:
		case VMCB_EXIT_VMLOAD:
		case VMCB_EXIT_VMSAVE:
		case VMCB_EXIT_STGI:
		case VMCB_EXIT_CLGI:
		case VMCB_EXIT_SKINIT:
		case VMCB_EXIT_MONITOR:
		case VMCB_EXIT_MWAIT_UNCOND:
		default:
			printf("Unhandled vmexit:\n"
			       "  code:  0x%" PRIx64 "\n"
			       "  info1: 0x%" PRIx64 "\n"
			       "  info2: 0x%" PRIx64 "\n",
			       vmcb->control.exit_code,
			       vmcb->control.exit_info_1,
			       vmcb->control.exit_info_2);
			print_vmcb(vmcb);
			ret = ENOSYS;
			kvm_run.exit_reason = KVM_EXIT_UNKNOWN;
		}

		num_runs++;
		if (num_runs == 20) //TODO: make this a #define
			break;
	}

//	printf("\n\n");

	/* we're going up to userspace - set the out fields of kvm_run: */

#define IF_MASK 0x00000200
	kvm_run.if_flag = !!(vcpu->vmcb->save.rflags & IF_MASK);

	/* TODO: kvm adds a check to see if in-kernel interrupt queues are empty */
	kvm_run.ready_for_interrupt_injection = kvm_run.if_flag &&
	                                        !vcpu->vmcb->control.intr_shadow;

	/* TODO kvm_run.ready_for_nmi_injection = ...; */

	kvm_run.cr8 = fkvm_get_cr8(vcpu);


	/* TODO: check copyout ret val */
	copyout(&kvm_run, uap->run, sizeof(struct kvm_run));
//	printf("sizeof(struct kvm_run) = %" PRIu64 "\n", sizeof(struct kvm_run));

	return ret;
}
/*
 * fkvm_create_vcpu syscall: allocate a new vcpu for the calling
 * process's guest VM and bind it to the calling thread.  Returns 0 or
 * an errno value.
 */
int
fkvm_create_vcpu(struct thread *td, struct fkvm_create_vcpu_args *uap)
{
	struct guestvm *guest_vm;
	struct vcpu *vcpu;

	if (!fkvm_loaded)
		return ENODEV;

	guest_vm = PROC_GET_GUESTVM(td->td_proc);
	if (guest_vm == NULL) {
		printf("PROC_GET_GUESTVM -> NULL\n");
		return ENODEV;
	}

	/* Allocate VCPU */
	printf("fkvm_create_vcpu: td = %p\n", td);
	vcpu = fkvm_vcpu_create(guest_vm);
	/* NOTE(review): vcpu is used without a NULL check - confirm
	 * fkvm_vcpu_create cannot fail. */
	fkvm_guestvm_add_vcpu(guest_vm, vcpu);

	TD_SET_VCPU(td, vcpu);
	printf("fkvm_create_vcpu: vcpu = %p\n", vcpu);
	return 0;
}
2131 static int
2132 fkvm_check_cpu_extension(void)
2134 u_int cpu_exthigh;
2135 u_int regs[4];
2136 u_int64_t vmcr;
2138 printf("fkvm_check_cpu_extension\n");
2140 /* Assumption: the architecture supports the cpuid instruction */
2142 /* Check if CPUID extended function 8000_0001h is supported. */
2143 do_cpuid(0x80000000, regs);
2144 cpu_exthigh = regs[0];
2146 printf("cpu_exthigh = %u\n", cpu_exthigh);
2148 if(cpu_exthigh >= 0x80000001) {
2149 /* Execute CPUID extended function 8000_0001h */
2150 do_cpuid(0x80000001, regs);
2151 printf("EAX = %u\n", regs[0]);
2153 if((regs[0] & 0x2) == 0) { /* Check SVM bit */
2154 printf("SVM not available\n");
2155 goto fail; /* SVM not available */
2158 vmcr = rdmsr(0xc0010114); /* Read VM_CR MSR */
2159 if((vmcr & 0x8) == 0) { /* Check SVMDIS bit */
2160 printf("vmcr = %" PRIx64 "\n", vmcr);
2161 printf("SVM allowed\n");
2162 return KERN_SUCCESS; /* SVM allowed */
2165 /* Execute CPUID extended function 8000_000ah */
2166 do_cpuid(0x8000000a, regs);
2167 if((regs[3] & 0x2) == 0) { /* Check SVM_LOCK bit */
2168 /* SVM disabled at bios; not unlockable.
2169 * User must change a BIOS setting to enable SVM.
2171 printf("EDX = %u\n", regs[3]);
2172 printf("SVM disabled at bios\n");
2173 goto fail;
2174 } else {
2175 /* TODO:
2176 * SVM may be unlockable;
2177 * consult the BIOS or TPM to obtain the key.
2179 printf("EDX = %u\n", regs[3]);
2180 printf("SVM maybe unlockable\n");
2181 goto fail;
2184 fail:
2185 return KERN_FAILURE;
2188 static void
2189 fkvm_proc_exit(void *arg, struct proc *p)
2191 struct guestvm *guest_vm;
2193 guest_vm = PROC_GET_GUESTVM(p);
2194 if (guest_vm == NULL)
2195 return;
2197 fkvm_destroy_vm(guest_vm);
2198 PROC_SET_GUESTVM(p, NULL);
/*
 * Module load (SYSINIT hook): verify SVM support, register the
 * process-exit hook, allocate and initialize the host save area and
 * the I/O / MSR permission bitmaps, enable SVM in EFER, and publish
 * the host save area's physical address in MSR_VM_HSAVE_PA.  On any
 * failure fkvm_loaded stays 0 and the syscalls return ENODEV.
 */
static void
fkvm_load(void *unused)
{
	u_int64_t efer;
	int error;

	printf("fkvm_load\n");
	printf("sizeof(struct vmcb) = %" PRIx64 "\n", sizeof(struct vmcb));

	hsave_area = NULL;
	iopm = NULL;
	msrpm = NULL;

	/* check if SVM is supported */
	error = fkvm_check_cpu_extension();
	if(error != KERN_SUCCESS) {
		printf("ERROR: SVM extension not available\n");
		return;
	}

	exit_tag = EVENTHANDLER_REGISTER(process_exit, fkvm_proc_exit, NULL,
					 EVENTHANDLER_PRI_ANY);

	/* allocate structures */
	/* NOTE(review): allocation results are not NULL-checked - confirm
	 * the fkvm_*_alloc helpers cannot fail. */
	hsave_area = fkvm_hsave_area_alloc();
	iopm = fkvm_iopm_alloc();
	msrpm = fkvm_msrpm_alloc();

	/* Initialize structures */
	fkvm_hsave_area_init(hsave_area);
	fkvm_iopm_init(iopm);
	fkvm_msrpm_init(msrpm);

	/* Enable SVM in EFER */
	efer = rdmsr(MSR_EFER);
	printf("EFER = %" PRIx64 "\n", efer);
	wrmsr(MSR_EFER, efer | EFER_SVME);
	efer = rdmsr(MSR_EFER);
	printf("new EFER = %" PRIx64 "\n", efer);

	/* Write Host save address in MSR_VM_HSAVE_PA */
	wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave_area));

	fkvm_loaded = 1;
}
SYSINIT(fkvm, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, fkvm_load, NULL);
2248 static void
2249 fkvm_unload(void *unused)
2251 printf("fkvm_unload\n");
2253 if (!fkvm_loaded) {
2254 printf("fkvm_unload: fkvm not loaded");
2255 return;
2258 EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
2260 if (msrpm != NULL) {
2261 fkvm_msrpm_free(iopm);
2262 msrpm = NULL;
2264 if (iopm != NULL) {
2265 fkvm_iopm_free(iopm);
2266 iopm = NULL;
2268 if (hsave_area != NULL) {
2269 fkvm_hsave_area_free(hsave_area);
2270 hsave_area = NULL;
2273 SYSUNINIT(fkvm, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, fkvm_unload, NULL);