remove kvm's R macro
[freebsd-src/fkvm-freebsd.git] / sys / kern / kern_fkvm.c
blobddc7fb8f7e7e64e7f7a43c9a0509152ed59f6d0a
1 /*-
2 * Copyright (c) 2008 The FreeBSD Project
3 * All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
27 #include <sys/cdefs.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/kernel.h>
31 #include <sys/malloc.h>
32 #include <sys/sysproto.h>
33 #include <sys/file.h>
34 #include <sys/mman.h>
35 #include <sys/proc.h>
36 #include <vm/vm.h>
37 #include <vm/pmap.h>
38 #include <vm/vm_extern.h>
39 #include <vm/vm_map.h>
40 #include <vm/vm_object.h>
41 #include <vm/vm_param.h>
42 #include <machine/_inttypes.h>
43 #include <machine/specialreg.h>
44 #include <machine/segments.h>
45 #include <machine/vmcb.h>
/*
 * Sizes, in bytes, of the two SVM permission maps.  fkvm_load() passes them
 * to contigmalloc() and fkvm_iopm_init()/fkvm_msrpm_init() fill exactly this
 * many bytes with 0xff.
 */
47 #define IOPM_SIZE (8*1024 + 1) /* TODO: ensure that this need not be 12 Kbytes, not just 8Kb+1 */
48 #define MSRPM_SIZE (8*1024)
/*
 * Module-wide state, allocated in fkvm_load() and released in fkvm_unload().
 * fkvm_vmcb_init() stores the physical addresses of iopm and msrpm in every
 * VMCB it initializes.
 */
50 /* fkvm data */
51 static void *iopm = NULL; /* Should I allocate a vm_object_t instead? */
52 static void *msrpm = NULL; /* Should I allocate a vm_object_t instead? */
/* Page whose physical address fkvm_load() writes to MSR_VM_HSAVE_PA. */
54 static void *hsave_area = NULL;
56 /* per-guest data */
/*
 * Indices into vcpu->regs[].  fkvm_vcpu_run() copies the RAX, RSP and RIP
 * slots to and from the VMCB save area; the RBX..R15 slots are loaded and
 * stored by its inline assembly via offsetof(struct vcpu, regs[...]).
 * VCPU_REGS_RIP takes the next value after R15 and NR_VCPU_REGS is the
 * resulting array length.
 */
58 enum {
59 VCPU_REGS_RAX = 0,
60 VCPU_REGS_RCX = 1,
61 VCPU_REGS_RDX = 2,
62 VCPU_REGS_RBX = 3,
63 VCPU_REGS_RSP = 4,
64 VCPU_REGS_RBP = 5,
65 VCPU_REGS_RSI = 6,
66 VCPU_REGS_RDI = 7,
67 VCPU_REGS_R8 = 8,
68 VCPU_REGS_R9 = 9,
69 VCPU_REGS_R10 = 10,
70 VCPU_REGS_R11 = 11,
71 VCPU_REGS_R12 = 12,
72 VCPU_REGS_R13 = 13,
73 VCPU_REGS_R14 = 14,
74 VCPU_REGS_R15 = 15,
75 VCPU_REGS_RIP,
76 NR_VCPU_REGS
/*
 * Per-virtual-CPU state.  The inline assembly in fkvm_vcpu_run() addresses
 * vmcb_pa and regs[] through offsetof(), so the members it uses must keep
 * their names.
 */
79 struct vcpu {
80 /* VCPU data */
/* Page-sized control block obtained from contigmalloc() at creation time. */
81 struct vmcb *vmcb;
/* vtophys(vmcb); refreshed by fkvm_vcpu_run() and loaded into %rax there. */
82 unsigned long vmcb_pa;
/* Guest register file, indexed by the VCPU_REGS_* constants. */
84 unsigned long regs[NR_VCPU_REGS];
/* Host MSR_GSBASE value captured before entering the guest. */
85 u_int64_t host_gs_base;
/* Guest CR2, copied into and out of the VMCB save area around each run. */
86 u_int64_t cr2;
/* NOTE(review): not referenced by any code visible in this file. */
87 u_int64_t cr3;
/* Owning guest VM. */
89 struct guestvm *guest_vm;
/* Capacity of guestvm.vcpus[]. */
92 #define MAX_VCPUS 8
/*
 * Per-guest state shared by all of the guest's VCPUs.
 */
94 struct guestvm {
/* VCPUs belonging to this guest; nr_vcpus counts the slots in use. */
95 struct vcpu *vcpus[MAX_VCPUS];
96 int nr_vcpus;
/* Guest-physical address space created by fkvm_make_vm_map(). */
98 struct vmspace *sp;
/* NOTE(review): not referenced by any code visible in this file. */
99 vm_object_t vm_obj;
/* Value returned by fkvm_make_vm_map(); copied into each VMCB's nested_cr3. */
101 u_int64_t nested_cr3;
/*
 * SVM instructions spelled as raw opcode bytes so they can be pasted into
 * inline assembly strings (presumably because the assembler in use lacks the
 * mnemonics -- confirm).  SVM_INVLPGA is not used by the code visible here.
 */
104 #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
105 #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
106 #define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
107 #define SVM_CLGI ".byte 0x0f, 0x01, 0xdd"
108 #define SVM_STGI ".byte 0x0f, 0x01, 0xdc"
109 #define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
/*
 * Return the guest VM that owns the VCPU attached to `thread`.
 *
 * The expansion dereferences TD_GET_VCPU(thread) unconditionally, so callers
 * must make sure the thread has a VCPU before using it.  The whole expansion
 * is parenthesized so it can be used safely inside larger expressions.
 */
#define GET_GUESTVM(thread) \
	((struct guestvm *)((struct vcpu *)TD_GET_VCPU(thread))->guest_vm)
113 static void
114 print_vmcb_seg(struct vmcb_seg* vmcb_seg, const char* name)
116 printf("%s Selector\n", name);
117 printf("Selector : %" PRIx16 "\n", vmcb_seg->selector);
118 printf("Attributes : %" PRIx16 "\n", vmcb_seg->attrib);
119 printf("Limit : %" PRIx32 "\n", vmcb_seg->limit);
120 printf("Base Address : %" PRIx64 "\n", vmcb_seg->base);
121 printf("\n");
124 static void
125 print_vmcb(struct vmcb *vmcb)
127 printf("VMCB Control Area\n");
128 printf("Intercept CR Reads : %" PRIx16 "\n", vmcb->control.intercept_cr_reads);
129 printf("Intercept CR Writes : %" PRIx16 "\n", vmcb->control.intercept_cr_writes);
130 printf("Intercept DR Reads : %" PRIx16 "\n", vmcb->control.intercept_dr_reads);
131 printf("Intercept DR Writes : %" PRIx16 "\n", vmcb->control.intercept_dr_writes);
132 printf("Intercept Exceptions : %" PRIx32 "\n", vmcb->control.intercept_exceptions);
133 printf("Intercepts : %" PRIx64 "\n", vmcb->control.intercepts);
134 printf("Reserved 1: \n");
135 for(int i=0; i < 44; i++) {
136 printf("%" PRIx8 "", vmcb->control.reserved_1[i]); /* Should be Zero */
138 printf("\n");
139 printf("IOPM Base PA : %" PRIx64 "\n", vmcb->control.iopm_base_pa);
140 printf("MSRPM Base PA : %" PRIx64 "\n", vmcb->control.msrpm_base_pa);
141 printf("TSC Offset : %" PRIx64 "\n", vmcb->control.tsc_offset);
142 printf("Guest ASID : %" PRIx32 "\n", vmcb->control.guest_asid);
143 printf("TLB Control : %" PRIx8 "\n", vmcb->control.tlb_control);
144 printf("Reserved 2 : \n");
145 for(int i=0; i < 3; i++) {
146 printf("%" PRIx8 "", vmcb->control.reserved_1[i]); /* Should be Zero */
148 printf("\n");
149 printf("Virtual TPR : %" PRIx8 "\n", vmcb->control.v_tpr);
150 printf("Virtual IRQ : %" PRIx8 "\n", vmcb->control.v_irq);
151 printf("Virtual Interrupt : %" PRIx8 "\n", vmcb->control.v_intr);
152 printf("Virtual Interrupt Masking: %" PRIx8 "\n", vmcb->control.v_intr_masking);
153 printf("Virtual Interrupt Vector : %" PRIx8 "\n", vmcb->control.v_intr_vector);
154 printf("Reserved 6 : \n");
155 for(int i=0; i < 3; i++) {
156 printf("%" PRIx8 "", vmcb->control.reserved_6[i]); /* Should be Zero */
158 printf("\n");
159 printf("Interrupt Shadow : %" PRIx8 "\n", vmcb->control.intr_shadow);
160 printf("Reserved 7 : \n");
161 for(int i=0; i < 7; i++) {
162 printf("%" PRIx8 "", vmcb->control.reserved_7[i]); /* Should be Zero */
164 printf("\n");
165 printf("Exit Code : %" PRIx64 "\n", vmcb->control.exit_code);
166 printf("Exit Info 1 : %" PRIx64 "\n", vmcb->control.exit_info_1);
167 printf("Exit Info 2 : %" PRIx64 "\n", vmcb->control.exit_info_2);
168 printf("Exit Interrupt Info : %" PRIx32 "\n", vmcb->control.exit_int_info);
169 printf("Exit Interrupt Info Err Code: %" PRIx32 "\n", vmcb->control.exit_int_info_err_code);
170 printf("Nested Control : %" PRIx64 "\n", vmcb->control.nested_ctl);
171 printf("Reserved 8 : \n");
172 for(int i=0; i < 16; i++) {
173 printf("%" PRIx8 "", vmcb->control.reserved_8[i]); /* Should be Zero */
175 printf("\n");
176 printf("Event Injection : %" PRIx64 "\n", vmcb->control.event_inj);
177 printf("Nested CR3 : %" PRIx64 "\n", vmcb->control.nested_cr3);
178 printf("LBR Virtualization Enable: %" PRIx64 "\n", vmcb->control.lbr_virt_enable);
179 printf("Reserved 9 : \n");
180 for(int i=0; i < 832; i++) {
181 printf("%" PRIx8 "", vmcb->control.reserved_9[i]); /* Should be Zero */
183 printf("\n");
185 printf("\n");
187 printf("VMCB Save Area\n");
188 print_vmcb_seg(&(vmcb->save.es), "ES");
189 print_vmcb_seg(&(vmcb->save.es), "CS");
190 print_vmcb_seg(&(vmcb->save.es), "SS");
191 print_vmcb_seg(&(vmcb->save.es), "DS");
192 print_vmcb_seg(&(vmcb->save.es), "FS");
193 print_vmcb_seg(&(vmcb->save.es), "GS");
194 print_vmcb_seg(&(vmcb->save.es), "GDTR");
195 print_vmcb_seg(&(vmcb->save.es), "LDTR");
196 print_vmcb_seg(&(vmcb->save.es), "IDTR");
197 print_vmcb_seg(&(vmcb->save.es), "TR");
198 printf("Reserved 1 : \n");
199 for(int i=0; i < 43; i++) {
200 printf("%" PRIx8 "", vmcb->save.reserved_1[i]); /* Should be Zero */
202 printf("\n");
203 printf("Current Processor Level : %" PRIx8 "\n", vmcb->save.cpl);
204 printf("Reserved 2 : \n");
205 for(int i=0; i < 4; i++) {
206 printf("%" PRIx8 "", vmcb->save.reserved_2[i]); /* Should be Zero */
208 printf("\n");
209 printf("EFER : %" PRIx64 "\n", vmcb->save.efer);
210 printf("Reserved 3 : \n");
211 for(int i=0; i < 112; i++) {
212 printf("%" PRIx8 "", vmcb->save.reserved_3[i]); /* Should be Zero */
214 printf("\n");
215 printf("Control Register 4 : %" PRIx64 "\n", vmcb->save.cr4);
216 printf("Control Register 3 : %" PRIx64 "\n", vmcb->save.cr3);
217 printf("Control Register 0 : %" PRIx64 "\n", vmcb->save.cr0);
218 printf("Debug Register 7 : %" PRIx64 "\n", vmcb->save.dr7);
219 printf("Debug Register 6 : %" PRIx64 "\n", vmcb->save.dr6);
220 printf("RFlags : %" PRIx64 "\n", vmcb->save.rflags);
221 printf("RIP : %" PRIx64 "\n", vmcb->save.rip);
222 printf("Reserved 4 : \n");
223 for(int i=0; i < 88; i++) {
224 printf("%" PRIx8 "", vmcb->save.reserved_4[i]); /* Should be Zero */
226 printf("\n");
227 printf("RSP : %" PRIx64 "\n", vmcb->save.rsp);
228 printf("Reserved 5 : \n");
229 for(int i=0; i < 24; i++) {
230 printf("%" PRIx8 "", vmcb->save.reserved_5[i]); /* Should be Zero */
232 printf("\n");
233 printf("RAX : %" PRIx64 "\n", vmcb->save.rax);
234 printf("STAR : %" PRIx64 "\n", vmcb->save.star);
235 printf("LSTAR : %" PRIx64 "\n", vmcb->save.lstar);
236 printf("CSTAR : %" PRIx64 "\n", vmcb->save.cstar);
237 printf("SFMASK : %" PRIx64 "\n", vmcb->save.sfmask);
238 printf("Kernel GS Base : %" PRIx64 "\n", vmcb->save.kernel_gs_base);
239 printf("SYSENTER CS : %" PRIx64 "\n", vmcb->save.sysenter_cs);
240 printf("SYSENTER ESP : %" PRIx64 "\n", vmcb->save.sysenter_esp);
241 printf("SYSENTER EIP : %" PRIx64 "\n", vmcb->save.sysenter_eip);
242 printf("Control Register 2 : %" PRIx64 "\n", vmcb->save.cr2);
243 printf("Reserved 6 : \n");
244 for(int i=0; i < 32; i++) {
245 printf("%" PRIx8 "", vmcb->save.reserved_6[i]); /* Should be Zero */
247 printf("\n");
248 printf("Global PAT : %" PRIx64 "\n", vmcb->save.g_pat);
249 printf("Debug Control : %" PRIx64 "\n", vmcb->save.dbg_ctl);
250 printf("BR From : %" PRIx64 "\n", vmcb->save.br_from);
251 printf("BR To : %" PRIx64 "\n", vmcb->save.br_to);
252 printf("Last Exception From : %" PRIx64 "\n", vmcb->save.last_excp_from);
253 printf("Last Exception To : %" PRIx64 "\n", vmcb->save.last_excp_to);
255 printf("\n\n");
258 static void
259 print_tss_desc(struct system_segment_descriptor *tss_desc)
261 printf("TSS desc @ %p:\n", tss_desc);
262 printf("sd_lolimit: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_lolimit);
263 printf("sd_lobase: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_lobase);
264 printf("sd_type: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_type);
265 printf("sd_dpl: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_dpl);
266 printf("sd_p: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_p);
267 printf("sd_hilimit: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_hilimit);
268 printf("sd_xx0: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_xx0);
269 printf("sd_gran: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_gran);
270 printf("sd_hibase: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_hibase);
271 printf("sd_xx1: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_xx1);
272 printf("sd_mbz: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_mbz);
273 printf("sd_xx2: 0x%" PRIx64 "\n", (u_int64_t) tss_desc->sd_xx2);
274 printf("\n\n");
277 static void
278 print_tss(struct system_segment_descriptor *tss_desc)
280 u_int32_t *base;
281 int limit;
282 int i;
284 base = (u_int32_t*) ((((u_int64_t) tss_desc->sd_hibase) << 24) | ((u_int64_t) tss_desc->sd_lobase));
285 limit = ((tss_desc->sd_hilimit << 16) | tss_desc->sd_lolimit) / 4;
287 printf("TSS: @ %p\n", base);
288 for (i = 0; i <= limit; i++)
289 printf("%x: 0x%" PRIx32 "\n", i, base[i]);
290 printf("\n\n");
293 static void
294 print_vmcb_save_area(struct vmcb *vmcb)
296 printf("VMCB save area:\n");
297 printf("fs: [selector %" PRIx16 ", attrib %" PRIx16 ", limit %" PRIx32 ", base %" PRIx64 "]\n",
298 vmcb->save.fs.selector,
299 vmcb->save.fs.attrib,
300 vmcb->save.fs.limit,
301 vmcb->save.fs.base);
302 printf("gs: [selector %" PRIx16 ", attrib %" PRIx16 ", limit %" PRIx32 ", base %" PRIx64 "]\n",
303 vmcb->save.gs.selector,
304 vmcb->save.gs.attrib,
305 vmcb->save.gs.limit,
306 vmcb->save.gs.base);
307 printf("tr: [selector %" PRIx16 ", attrib %" PRIx16 ", limit %" PRIx32 ", base %" PRIx64 "]\n",
308 vmcb->save.tr.selector,
309 vmcb->save.tr.attrib,
310 vmcb->save.tr.limit,
311 vmcb->save.tr.base);
312 printf("ldtr: [selector %" PRIx16 ", attrib %" PRIx16 ", limit %" PRIx32 ", base %" PRIx64 "]\n",
313 vmcb->save.ldtr.selector,
314 vmcb->save.ldtr.attrib,
315 vmcb->save.ldtr.limit,
316 vmcb->save.ldtr.base);
317 printf("kernel_gs_base: %" PRIx64 "\n", vmcb->save.kernel_gs_base);
318 printf("star: %" PRIx64 "\n", vmcb->save.star);
319 printf("lstar: %" PRIx64 "\n", vmcb->save.lstar);
320 printf("cstar: %" PRIx64 "\n", vmcb->save.cstar);
321 printf("sfmask: %" PRIx64 "\n", vmcb->save.sfmask);
322 printf("sysenter_cs: %" PRIx64 "\n", vmcb->save.sysenter_cs);
323 printf("sysenter_esp: %" PRIx64 "\n", vmcb->save.sysenter_esp);
324 printf("sysenter_eip: %" PRIx64 "\n", vmcb->save.sysenter_eip);
325 printf("\n\n");
328 static int
329 vmrun_assert(struct vmcb *vmcb)
331 #define A(cond) do { if ((cond)) { printf("Error: assertion not met on line %d\n", __LINE__); bad = 1; } } while (0)
333 int bad;
335 bad = 0;
337 // The following are illegal:
339 //EFER.SVME is zero.
340 A((vmcb->save.efer & 0x0000000000001000) == 0);
342 // CR0.CD is zero and CR0.NW is set
343 A( ((vmcb->save.cr0 & 0x0000000040000000) == 0) &&
344 ((vmcb->save.cr0 & 0x0000000020000000) != 0));
346 // CR0[63:32] are not zero.
347 A((vmcb->save.cr0 & 0xFFFFFFFF00000000) == 0xFFFFFFFF00000000);
349 // Any MBZ bit of CR3 is set.
350 A((vmcb->save.cr3 & 0xFFF0000000000000) != 0);
352 // CR4[63:11] are not zero.
353 A((vmcb->save.cr4 & 0xFFFFFFFFFFFFF800) == 0xFFFFFFFFFFFFF800);
355 // DR6[63:32] are not zero.
356 A((vmcb->save.dr6 & 0xFFFFFFFF00000000) == 0xFFFFFFFF00000000);
358 // DR7[63:32] are not zero.
359 A((vmcb->save.dr7 & 0xFFFFFFFF00000000) == 0xFFFFFFFF00000000);
361 // EFER[63:15] are not zero.
362 A((vmcb->save.efer & 0xFFFFFFFFFFFF8000) == 0xFFFFFFFFFFF8000);
364 // EFER.LMA or EFER.LME is non-zero and this processor does not support long mode.
365 //// A((vmcb->save.efer & 0x0000000000000500) != 0);
367 // EFER.LME and CR0.PG are both set and CR4.PAE is zero.
368 A( ((vmcb->save.efer & 0x0000000000000100) != 0) &&
369 ((vmcb->save.cr0 & 0x0000000080000000) != 0) &&
370 ((vmcb->save.cr4 & 0x0000000000000020) != 0));
372 // EFER.LME and CR0.PG are both non-zero and CR0.PE is zero.
373 A( ((vmcb->save.efer & 0x0000000000000100) != 0) &&
374 ((vmcb->save.cr0 & 0x0000000080000000) != 0) &&
375 ((vmcb->save.cr0 & 0x0000000000000001) == 0));
377 // EFER.LME, CR0.PG, CR4.PAE, CS.L, and CS.D are all non-zero.
378 // cs.attrib = concat 55-52 and 47-40 (p372 v2)
379 A( ((vmcb->save.efer & 0x0000000000000100) != 0) &&
380 ((vmcb->save.cr0 & 0x0000000080000000) != 0) &&
381 ((vmcb->save.cr4 & 0x0000000000000020) != 0) &&
382 ((vmcb->save.cs.attrib & 0x0200) != 0) &&
383 ((vmcb->save.cs.attrib & 0x0400) != 0));
385 // The VMRUN intercept bit is clear.
386 A((vmcb->control.intercepts & 0x0000000100000000) == 0);
388 // The MSR or IOIO intercept tables extend to a physical address that is
389 // greater than or equal to the maximum supported physical address.
391 // Illegal event injection (see Section 15.19 on page 391).
393 // ASID is equal to zero.
394 A(vmcb->control.guest_asid == 0);
396 // VMRUN can load a guest value of CR0 with PE = 0 but PG = 1, a
397 // combination that is otherwise illegal (see Section 15.18).
399 // In addition to consistency checks, VMRUN and #VMEXIT canonicalize (i.e.,
400 // sign-extend to 63 bits) all base addresses in the segment registers
401 // that have been loaded.
403 return bad;
405 #undef A
/*
 * Enter the guest once on the current CPU and return after the next #VMEXIT.
 *
 * Sequence, as written below:
 *   1. vmrun_assert() validates the VMCB; on failure the function returns
 *      without running the guest (the caller cannot tell the difference).
 *   2. The VMCB physical address is recorded in vcpu->vmcb_pa and guest
 *      RAX/RSP/RIP/CR2 are copied from the vcpu into the VMCB save area.
 *   3. Host state is captured: GS base, FS/GS/LDT selectors, CR2, DR6, DR7
 *      and the STAR/LSTAR/CSTAR/SF_MASK MSRs.
 *   4. CLGI, then the inline assembly loads the remaining guest registers
 *      from vcpu->regs[], executes VMLOAD/VMRUN/VMSAVE with %rax holding
 *      vmcb_pa, and stores the guest registers back.
 *   5. Guest CR2/RAX/RSP/RIP are copied back into the vcpu, the captured
 *      host state is restored, the task register is reloaded, then STGI.
 *
 * The exit reason is left in vmcb->control.exit_code for the caller.
 * The many printf()/print_*() calls are debugging output.
 */
408 static void
409 fkvm_vcpu_run(struct vcpu *vcpu, struct vmcb *vmcb)
/* Host syscall MSR values saved across the guest run. */
411 u_int64_t lstar;
412 u_int64_t cstar;
413 u_int64_t star;
414 u_int64_t sfmask;
/* Host segment selectors saved across the guest run. */
416 u_short fs_selector;
417 u_short gs_selector;
418 u_short ldt_selector;
/* Host CR2 and debug registers saved across the guest run. */
420 unsigned long host_cr2;
421 unsigned long host_dr6;
422 unsigned long host_dr7;
/* Host TSS descriptor in the GDT and the selector used to reload TR. */
424 struct system_segment_descriptor *tss_desc;
425 u_int64_t sel;
427 printf("begin fkvm_vcpu_run\n");
/* Refuse to run a VMCB that fails the consistency checks. */
429 if (vmrun_assert(vmcb))
430 return;
433 tss_desc = (struct system_segment_descriptor*) (&gdt[GPROC0_SEL]);
434 sel = GSEL(GPROC0_SEL, SEL_KPL);
436 printf("GSEL(GPROC0_SEL, SEL_KPL)=0x%" PRIx64 "\n", sel);
437 print_tss_desc(tss_desc);
438 print_tss(tss_desc);
440 print_vmcb_save_area(vmcb);
441 // disable_intr();
/* The assembly below reads vmcb_pa through offsetof(struct vcpu, vmcb_pa). */
443 vcpu->vmcb_pa = vtophys(vmcb);
444 printf("vmcb = 0x%p\n", vmcb);
445 printf("vcpu->vmcb_pa = 0x%lx\n", vcpu->vmcb_pa);
/* RAX, RSP and RIP travel through the VMCB save area, not the asm block. */
447 vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX];
448 vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP];
449 vmcb->save.rip = vcpu->regs[VCPU_REGS_RIP];
451 /* meh: kvm has pre_svm_run(svm); */
453 vcpu->host_gs_base = rdmsr(MSR_GSBASE);
454 printf("host_gs_base: 0x%" PRIx64 "\n", vcpu->host_gs_base);
456 fs_selector = rfs();
457 gs_selector = rgs();
458 ldt_selector = rldt();
459 printf("fs selector: %hx\n", fs_selector);
460 printf("gs selector: %hx\n", gs_selector);
461 printf("ldt selector: %hx\n", ldt_selector);
463 host_cr2 = rcr2();
465 host_dr6 = rdr6();
466 host_dr7 = rdr7();
468 vmcb->save.cr2 = vcpu->cr2;
469 /* meh: cr3? */
471 /* meh: dr7? db_regs? */
473 printf("MSR_STAR: %" PRIx64 "\n", rdmsr(MSR_STAR));
474 printf("MSR_LSTAR: %" PRIx64 "\n", rdmsr(MSR_LSTAR));
475 printf("MSR_CSTAR: %" PRIx64 "\n", rdmsr(MSR_CSTAR));
476 printf("MSR_SF_MASK: %" PRIx64 "\n", rdmsr(MSR_SF_MASK));
478 star = rdmsr(MSR_STAR);
479 lstar = rdmsr(MSR_LSTAR);
480 cstar = rdmsr(MSR_CSTAR);
481 sfmask = rdmsr(MSR_SF_MASK);
483 printf("CLGI...\n");
/* No printf() between here and STGI below. */
485 __asm __volatile (SVM_CLGI);
488 // enable_intr();
/*
 * %rax enters holding the vcpu pointer (the "a" input constraint).  It is
 * pushed while %rax is reused for vmcb_pa around VMLOAD/VMRUN/VMSAVE and
 * popped afterwards so the guest registers can be stored back through it.
 * %rbp is saved and restored by hand with push/pop.
 */
490 __asm __volatile (
491 "push %%rbp; \n\t"
492 "mov %c[rbx](%[svm]), %%rbx \n\t"
493 "mov %c[rcx](%[svm]), %%rcx \n\t"
494 "mov %c[rdx](%[svm]), %%rdx \n\t"
495 "mov %c[rsi](%[svm]), %%rsi \n\t"
496 "mov %c[rdi](%[svm]), %%rdi \n\t"
497 "mov %c[rbp](%[svm]), %%rbp \n\t"
498 "mov %c[r8](%[svm]), %%r8 \n\t"
499 "mov %c[r9](%[svm]), %%r9 \n\t"
500 "mov %c[r10](%[svm]), %%r10 \n\t"
501 "mov %c[r11](%[svm]), %%r11 \n\t"
502 "mov %c[r12](%[svm]), %%r12 \n\t"
503 "mov %c[r13](%[svm]), %%r13 \n\t"
504 "mov %c[r14](%[svm]), %%r14 \n\t"
505 "mov %c[r15](%[svm]), %%r15 \n\t"
507 /* Enter guest mode */
508 "push %%rax \n\t"
509 "mov %c[vmcb](%[svm]), %%rax \n\t"
510 SVM_VMLOAD "\n\t"
511 SVM_VMRUN "\n\t"
512 SVM_VMSAVE "\n\t"
513 "pop %%rax \n\t"
515 /* Save guest registers, load host registers */
516 "mov %%rbx, %c[rbx](%[svm]) \n\t"
517 "mov %%rcx, %c[rcx](%[svm]) \n\t"
518 "mov %%rdx, %c[rdx](%[svm]) \n\t"
519 "mov %%rsi, %c[rsi](%[svm]) \n\t"
520 "mov %%rdi, %c[rdi](%[svm]) \n\t"
521 "mov %%rbp, %c[rbp](%[svm]) \n\t"
522 "mov %%r8, %c[r8](%[svm]) \n\t"
523 "mov %%r9, %c[r9](%[svm]) \n\t"
524 "mov %%r10, %c[r10](%[svm]) \n\t"
525 "mov %%r11, %c[r11](%[svm]) \n\t"
526 "mov %%r12, %c[r12](%[svm]) \n\t"
527 "mov %%r13, %c[r13](%[svm]) \n\t"
528 "mov %%r14, %c[r14](%[svm]) \n\t"
529 "mov %%r15, %c[r15](%[svm]) \n\t"
530 "pop %%rbp"
532 : [svm]"a"(vcpu),
533 [vmcb]"i"(offsetof(struct vcpu, vmcb_pa)),
534 [rbx]"i"(offsetof(struct vcpu, regs[VCPU_REGS_RBX])),
535 [rcx]"i"(offsetof(struct vcpu, regs[VCPU_REGS_RCX])),
536 [rdx]"i"(offsetof(struct vcpu, regs[VCPU_REGS_RDX])),
537 [rsi]"i"(offsetof(struct vcpu, regs[VCPU_REGS_RSI])),
538 [rdi]"i"(offsetof(struct vcpu, regs[VCPU_REGS_RDI])),
539 [rbp]"i"(offsetof(struct vcpu, regs[VCPU_REGS_RBP])),
540 [r8 ]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R8 ])),
541 [r9 ]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R9 ])),
542 [r10]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R10])),
543 [r11]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R11])),
544 [r12]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R12])),
545 [r13]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R13])),
546 [r14]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R14])),
547 [r15]"i"(offsetof(struct vcpu, regs[VCPU_REGS_R15]))
548 : "cc", "memory",
549 "rbx", "rcx", "rdx", "rsi", "rdi",
550 "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
554 /* meh: dr7? db_regs? */
/* Pull the guest state that lives in the VMCB back into the vcpu. */
556 vcpu->cr2 = vmcb->save.cr2;
558 vcpu->regs[VCPU_REGS_RAX] = vmcb->save.rax;
559 vcpu->regs[VCPU_REGS_RSP] = vmcb->save.rsp;
560 vcpu->regs[VCPU_REGS_RIP] = vmcb->save.rip;
/* Restore the host state captured before CLGI. */
562 load_dr6(host_dr6);
563 load_dr7(host_dr7);
565 load_cr2(host_cr2);
567 load_fs(fs_selector);
568 load_gs(gs_selector);
569 lldt(ldt_selector);
571 wrmsr(MSR_GSBASE, vcpu->host_gs_base);
/*
 * NOTE(review): presumably the descriptor type is reset to SDT_SYSTSS
 * ("available") because ltr would fault on a descriptor still marked
 * busy -- confirm.
 */
573 tss_desc->sd_type = SDT_SYSTSS;
574 ltr(sel);
576 wrmsr(MSR_STAR, star);
577 wrmsr(MSR_LSTAR, lstar);
578 wrmsr(MSR_CSTAR, cstar);
579 wrmsr(MSR_SF_MASK, sfmask);
581 // disable_intr();
583 __asm __volatile (SVM_STGI);
585 printf("STGI\n");
587 printf("exit_code: %" PRIx64 "\n", vmcb->control.exit_code);
/* Everything below re-reads and prints host state for debugging. */
589 printf("MSR_STAR: %" PRIx64 "\n", rdmsr(MSR_STAR));
590 printf("MSR_LSTAR: %" PRIx64 "\n", rdmsr(MSR_LSTAR));
591 printf("MSR_CSTAR: %" PRIx64 "\n", rdmsr(MSR_CSTAR));
592 printf("MSR_SF_MASK: %" PRIx64 "\n", rdmsr(MSR_SF_MASK));
594 fs_selector = rfs();
595 gs_selector = rgs();
596 ldt_selector = rldt();
597 printf("fs selector: %hx\n", fs_selector);
598 printf("gs selector: %hx\n", gs_selector);
599 printf("ldt selector: %hx\n", ldt_selector);
/* Note: this overwrites the host_gs_base value saved before the run. */
601 vcpu->host_gs_base = rdmsr(MSR_GSBASE);
602 printf("host_gs_base: 0x%" PRIx64 "\n", vcpu->host_gs_base);
604 print_tss_desc(tss_desc);
605 print_tss(tss_desc);
607 print_vmcb_save_area(vmcb);
609 // enable_intr();
611 /* meh: next_rip */
614 static void
615 _fkvm_init_seg(struct vmcb_seg *seg, uint16_t attrib)
617 seg->selector = 0;
618 seg->attrib = VMCB_SELECTOR_P_MASK | attrib;
619 seg->limit = 0xffff;
620 seg->base = 0;
623 static inline void
624 fkvm_init_seg(struct vmcb_seg *seg)
626 _fkvm_init_seg(seg, VMCB_SELECTOR_S_MASK | VMCB_SELECTOR_WRITE_MASK);
/*
 * Initialize a VMCB system-segment register (LDTR, TR).  `attrib` carries
 * the system descriptor type; the remaining fields get the common initial
 * values from _fkvm_init_seg().
 */
static inline void
fkvm_init_sys_seg(struct vmcb_seg *seg, uint16_t attrib)
{
	const uint16_t sys_attrib = attrib;

	_fkvm_init_seg(seg, sys_attrib);
}
635 static void
636 fkvm_iopm_init(void *iopm)
638 memset(iopm, 0xff, IOPM_SIZE); /* TODO: we may want to allow access to PC debug port */
641 static void
642 fkvm_msrpm_init(void *msrpm)
644 memset(msrpm, 0xff, MSRPM_SIZE); /* TODO: we may want to allow some MSR accesses */
648 static u_int64_t
649 fkvm_make_vm_map(struct vmspace **sp)
651 struct vmspace *_sp;
653 _sp = vmspace_alloc(0, 0xffffffffffffffff);
654 if (_sp == NULL) {
655 printf("vmspace_alloc failed\n");
656 goto fail;
659 *sp = _sp;
660 return vtophys(vmspace_pmap(_sp)->pm_pml4);
662 fail:
663 if (_sp != NULL) {
664 vmspace_free(_sp);
665 *sp = NULL;
667 return 0;
672 static void
673 fkvm_vmcb_init(struct vmcb *vmcb)
675 struct vmcb_control_area *control = &vmcb->control;
676 struct vmcb_save_area *save = &vmcb->save;
678 control->intercept_cr_reads = INTERCEPT_CR4_MASK;
680 control->intercept_cr_writes = INTERCEPT_CR4_MASK |
681 INTERCEPT_CR8_MASK;
683 control->intercept_dr_reads = INTERCEPT_DR0_MASK |
684 INTERCEPT_DR1_MASK |
685 INTERCEPT_DR2_MASK |
686 INTERCEPT_DR3_MASK;
688 control->intercept_dr_writes = INTERCEPT_DR0_MASK |
689 INTERCEPT_DR1_MASK |
690 INTERCEPT_DR2_MASK |
691 INTERCEPT_DR3_MASK |
692 INTERCEPT_DR5_MASK |
693 INTERCEPT_DR7_MASK;
695 control->intercept_exceptions = (1 << IDT_UD) | // Invalid Opcode
696 (1 << IDT_MC); // Machine Check
698 control->intercepts = INTERCEPT_INTR |
699 INTERCEPT_NMI |
700 INTERCEPT_SMI |
701 INTERCEPT_CPUID |
702 INTERCEPT_INVD |
703 INTERCEPT_HLT |
704 INTERCEPT_INVLPGA |
705 INTERCEPT_IOIO_PROT |
706 INTERCEPT_MSR_PROT |
707 INTERCEPT_SHUTDOWN |
708 INTERCEPT_VMRUN |
709 INTERCEPT_VMMCALL |
710 INTERCEPT_VMLOAD |
711 INTERCEPT_VMSAVE |
712 INTERCEPT_STGI |
713 INTERCEPT_CLGI |
714 INTERCEPT_SKINIT |
715 INTERCEPT_WBINVD |
716 INTERCEPT_MONITOR |
717 INTERCEPT_MWAIT_UNCOND;
719 control->iopm_base_pa = vtophys(iopm);
720 control->msrpm_base_pa = vtophys(msrpm);
721 control->tsc_offset = 0;
723 /* TODO: remove this once we assign asid's to distinct VM's */
724 control->guest_asid = 1;
725 control->tlb_control = VMCB_TLB_CONTROL_FLUSH_ALL;
727 /* let v_tpr default to 0 */
728 /* let v_irq default to 0 */
729 /* let v_intr default to 0 */
731 control->v_intr_masking = 1;
733 /* let v_intr_vector default to 0 */
734 /* let intr_shadow default to 0 */
735 /* let exit_code, exit_info_1, exit_info_2, exit_int_info,
736 exit_int_info_err_code default to 0 */
738 control->nested_ctl = 1;
740 /* let event_inj default to 0 */
742 // (nested_cr3 is later)
744 /* let lbr_virt_enable default to 0 */
747 fkvm_init_seg(&save->ds);
748 fkvm_init_seg(&save->es);
749 fkvm_init_seg(&save->fs);
750 fkvm_init_seg(&save->gs);
751 fkvm_init_seg(&save->ss);
753 _fkvm_init_seg(&save->cs, VMCB_SELECTOR_READ_MASK | VMCB_SELECTOR_S_MASK |
754 VMCB_SELECTOR_CODE_MASK);
755 save->cs.selector = 0xf000;
756 save->cs.base = 0xffff0000;
758 save->gdtr.limit = 0xffff;
759 save->idtr.limit = 0xffff;
761 fkvm_init_sys_seg(&save->ldtr, SDT_SYSLDT);
762 fkvm_init_sys_seg(&save->tr, SDT_SYS286BSY);
764 save->g_pat = PAT_VALUE(PAT_WRITE_BACK, 0) | PAT_VALUE(PAT_WRITE_THROUGH, 1) |
765 PAT_VALUE(PAT_UNCACHED, 2) | PAT_VALUE(PAT_UNCACHEABLE, 3) |
766 PAT_VALUE(PAT_WRITE_BACK, 4) | PAT_VALUE(PAT_WRITE_THROUGH, 5) |
767 PAT_VALUE(PAT_UNCACHED, 6) | PAT_VALUE(PAT_UNCACHEABLE, 7);
769 /* CR0 = 6000_0010h at boot */
770 save->cr0 = CR0_ET | CR0_NW | CR0_CD;
771 save->dr6 = 0xffff0ff0;
772 save->dr7 = 0x400;
773 save->rflags = 2;
774 save->rip = 0x0000fff0;
776 save->efer = EFER_SVME;
778 //control->nested_cr3 = fkvm_make_vm_map();
780 printf("ncr3: %" PRIx64 "\n", control->nested_cr3);
/*
 * Stub system call: logs its own name and returns 1.  Neither argument is
 * used.  The return type is spelled out explicitly instead of relying on
 * implicit int.
 */
int
fkvm_userpoke(struct thread *td, struct fkvm_userpoke_args *uap)
{
	printf("fkvm_userpoke\n");
	return 1;
}
791 /* System Calls */
792 /* This function can only be called with multiples of page sizes */
794 fkvm_set_user_mem_region(struct thread *td, struct fkvm_set_user_mem_region_args *uap)
796 struct guestvm *guest_vm = GET_GUESTVM(td);
797 struct file *fp;
798 struct shmfd *shmfd;
799 vm_offset_t start;
800 vm_offset_t end;
801 int error;
803 error = fget(td, uap->fd, &fp);
804 if(error)
805 return error;
807 shmfd = fp->f_data;
808 printf("shm:\n");
809 printf(" size: %d bytes\n", (int) shmfd->shm_size);
810 printf(" vm object: %p\n", shmfd->shm_object);
811 printf(" size: %d pages\n", (int) shmfd->shm_object->size);
813 start = uap->guest_pa;
814 end = uap->guest_pa + uap->size;
815 printf("start: %d bytes\n", (int) start);
816 printf("end: %d bytes\n", (int) end);
818 vm_object_reference(shmfd->shm_object); // TODO: this might be a mem leak
819 error = vm_map_insert(&guest_vm->sp->vm_map,
820 shmfd->shm_object,
821 uap->vm_ooffset,
822 start,
823 end,
824 VM_PROT_ALL, VM_PROT_ALL,
826 if (error != KERN_SUCCESS) {
827 printf("vm_map_insert failed: %d\n", error);
828 return 1;
831 return 0;
835 fkvm_create_vm(struct thread *td, struct fkvm_create_vm_args *uap)
837 struct vcpu *vcpu;
838 struct guestvm *guest_vm;
840 printf("SYSCALL : fkvm_create_vm\n");
842 /* Allocate Guest VM */
843 guest_vm = (struct guestvm *)malloc(sizeof(struct guestvm), M_DEVBUF,
844 M_WAITOK);
845 if(guest_vm == NULL)
846 return 0;
848 /* TODO: Set up the vm address space */
850 /* Allocate VCPU0 */
851 vcpu = (struct vcpu*)malloc(sizeof(struct vcpu), M_DEVBUF, M_WAITOK);
852 if(vcpu == NULL)
853 goto errout0;
855 guest_vm->vcpus[0] = vcpu;
856 guest_vm->nr_vcpus = 1;
857 vcpu->guest_vm = guest_vm;
859 TD_SET_VCPU(td, vcpu);
861 /* Allocate VMCB */
862 vcpu->vmcb = (struct vmcb *)contigmalloc(PAGE_SIZE, M_DEVBUF, M_ZERO, 0, -1UL,
863 PAGE_SIZE, 0);
864 if(vcpu->vmcb == NULL)
865 goto errout1;
867 /* Initialize VMCB */
868 fkvm_vmcb_init(vcpu->vmcb);
870 /* TODO: Ugly */
871 vcpu->vmcb->control.nested_cr3 = fkvm_make_vm_map(&guest_vm->sp);
872 guest_vm->nested_cr3 = vcpu->vmcb->control.nested_cr3;
873 printf("fkvm_create_vm done. ncr3 %" PRIx64 "\n", guest_vm->nested_cr3);
874 return 1;
876 errout1:
877 contigfree(vcpu, PAGE_SIZE, M_DEVBUF);
878 errout0:
879 contigfree(guest_vm, PAGE_SIZE, M_DEVBUF);
881 return 0;
885 fkvm_destroy_vm(struct thread *td, struct fkvm_destroy_vm_args *uap)
887 struct guestvm *guest_vm = GET_GUESTVM(td);
888 int i;
890 /* Destroy the VCPUs */
891 for(i = 0; i < guest_vm->nr_vcpus; i++) {
892 if(guest_vm->vcpus[i] != NULL) {
893 if(guest_vm->vcpus[i]->vmcb != NULL)
894 contigfree(guest_vm->vcpus[i]->vmcb, PAGE_SIZE, M_DEVBUF);
895 contigfree(guest_vm->vcpus[i], PAGE_SIZE, M_DEVBUF);
899 /* Destroy the Guest VM itself */
900 contigfree(guest_vm, PAGE_SIZE, M_DEVBUF);
902 return 1;
906 fkvm_vm_run(struct thread *td, struct fkvm_vm_run_args *uap)
908 struct vcpu *vcpu = TD_GET_VCPU(td);
909 struct guestvm *guest_vm = GET_GUESTVM(td);
910 struct vmcb *vmcb = vcpu->vmcb;
912 fkvm_vcpu_run(vcpu, vmcb);
914 switch (vmcb->control.exit_code) {
916 case VMCB_EXIT_EXCP_BASE ... (VMCB_EXIT_EXCP_BASE + 31): {
917 int excp_vector;
919 excp_vector = vmcb->control.exit_code - VMCB_EXIT_EXCP_BASE;
921 printf("VMCB_EXIT_EXCP_BASE, exception vector: 0x%x\n",
922 excp_vector);
923 break;
926 case VMCB_EXIT_INTR: {
927 printf("VMCB_EXIT_INTR - nothing to do\n");
928 break;
931 case VMCB_EXIT_NPF: {
932 /* EXITINFO1 contains fault error code */
933 /* EXITINFO2 contains the guest physical address causing the fault. */
935 u_int64_t fault_code;
936 u_int64_t fault_gpa;
938 vm_prot_t fault_type;
939 int fault_flags;
940 int rc;
942 fault_code = vmcb->control.exit_info_1;
943 fault_gpa = vmcb->control.exit_info_2;
945 printf("VMCB_EXIT_NPF:\n");
946 printf("gpa=0x%" PRIx64 "\n", fault_gpa);
947 printf("fault code=0x%" PRIx64 " [P=%x, R/W=%x, U/S=%x, I/D=%x]\n",
948 fault_code,
949 (fault_code & PGEX_P) != 0,
950 (fault_code & PGEX_W) != 0,
951 (fault_code & PGEX_U) != 0,
952 (fault_code & PGEX_I) != 0);
954 if (fault_code & PGEX_W)
955 fault_type = VM_PROT_WRITE;
956 else if (fault_code & PGEX_I)
957 fault_type = VM_PROT_EXECUTE;
958 else
959 fault_type = VM_PROT_READ;
961 fault_flags = 0; /* TODO: is that right? */
962 rc = vm_fault(&guest_vm->sp->vm_map, fault_gpa, fault_type, fault_flags);
963 if (rc != KERN_SUCCESS)
964 printf("vm_fault failed: %d\n", rc);
965 break;
967 default:
968 printf("Unhandled vmexit:\n"
969 " code: 0x%" PRIx64 "\n"
970 " info1: 0x%" PRIx64 "\n"
971 " info2: 0x%" PRIx64 "\n",
972 vmcb->control.exit_code,
973 vmcb->control.exit_info_1,
974 vmcb->control.exit_info_2);
975 print_vmcb(vmcb);
978 return 1;
981 int
982 fkvm_create_vcpu(struct thread *td, struct fkvm_create_vcpu_args *uap)
984 struct guestvm *guest_vm = NULL; /* TODO: How to get this? */
985 struct vcpu *vcpu;
987 /* Allocate VCPU */
988 vcpu = (struct vcpu*)malloc(sizeof(struct vcpu), M_DEVBUF, M_WAITOK);
989 if(vcpu == NULL)
990 return 0;
992 guest_vm->nr_vcpus++; /* TODO: Probably not safe to increment */
993 /* How about a lock to protect all of this? */
995 guest_vm->vcpus[guest_vm->nr_vcpus] = vcpu;
996 vcpu->guest_vm = guest_vm;
998 td->vcpu = vcpu;
1000 /* Allocate VMCB */
1001 vcpu->vmcb = (struct vmcb *)contigmalloc(PAGE_SIZE, M_DEVBUF, M_ZERO, 0, -1UL,
1002 PAGE_SIZE, 0);
1003 if(vcpu->vmcb == NULL)
1004 goto errout0;
1006 /* Initialize VMCB */
1007 fkvm_vmcb_init(vcpu->vmcb);
1008 vcpu->vmcb->control.nested_cr3 = guest_vm->nested_cr3;
1010 return 1;
1012 errout0:
1013 contigfree(vcpu, PAGE_SIZE, M_DEVBUF);
1015 return 0;
1018 static void
1019 fkvm_load(void *unused)
1021 u_int64_t efer;
1023 printf("fkvm_load\n");
1024 printf("sizeof(struct vmcb) = %" PRIx64 "\n", sizeof(struct vmcb));
1026 /* TODO: check for the presense of extensions */
1028 hsave_area = contigmalloc(PAGE_SIZE, M_DEVBUF, 0, 0, -1UL,
1029 PAGE_SIZE, 0);
1030 if(hsave_area == NULL)
1031 return;
1033 iopm = contigmalloc(IOPM_SIZE, M_DEVBUF, 0, 0, -1UL, PAGE_SIZE, 0);
1034 if(iopm == NULL)
1035 goto errout0;
1037 msrpm = contigmalloc(MSRPM_SIZE, M_DEVBUF, 0, 0, -1UL, PAGE_SIZE, 0);
1038 if(msrpm == NULL)
1039 goto errout1;
1041 /* Initialize iopm and msrpm */
1042 fkvm_iopm_init(iopm);
1043 fkvm_msrpm_init(msrpm);
1045 /* Enable SVM in EFER */
1046 efer = rdmsr(MSR_EFER);
1047 printf("EFER = %" PRIx64 "\n", efer);
1048 wrmsr(MSR_EFER, efer | EFER_SVME);
1049 efer = rdmsr(MSR_EFER);
1050 printf("new EFER = %" PRIx64 "\n", efer);
1052 /* Write Host save address in MSR_VM_HSAVE_PA */
1053 wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave_area));
1055 return;
1057 errout1:
1058 contigfree(iopm, IOPM_SIZE, M_DEVBUF);
1059 iopm = NULL;
1060 errout0:
1061 contigfree(hsave_area, PAGE_SIZE, M_DEVBUF);
1062 hsave_area = NULL;
1064 SYSINIT(fkvm, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, fkvm_load, NULL);
1066 static void
1067 fkvm_unload(void *unused)
1069 printf("fkvm_unload\n");
1070 /* TODO */
1072 if(msrpm != NULL)
1073 contigfree(msrpm, MSRPM_SIZE, M_DEVBUF);
1075 if(iopm != NULL)
1076 contigfree(iopm, IOPM_SIZE, M_DEVBUF);
1078 if(hsave_area != NULL)
1079 contigfree(hsave_area, PAGE_SIZE, M_DEVBUF);
1081 SYSUNINIT(fkvm, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, fkvm_unload, NULL);