kernel - Implement spectre mitigations part 1
[dragonfly.git] / sys / platform / pc64 / x86_64 / vm_machdep.c
blob1e4e557ac24e0ed21f93ceb90d59fd0c2c675f73
1 /*-
2 * Copyright (c) 1982, 1986 The Regents of the University of California.
3 * Copyright (c) 1989, 1990 William Jolitz
4 * Copyright (c) 1994 John Dyson
5 * Copyright (c) 2008-2018 The DragonFly Project.
6 * All rights reserved.
8 * This code is derived from software contributed to Berkeley by
9 * the Systems Programming Group of the University of Utah Computer
10 * Science Department, and William Jolitz.
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
40 * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91
41 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
42 * $FreeBSD: src/sys/i386/i386/vm_machdep.c,v 1.132.2.9 2003/01/25 19:02:23 dillon Exp $
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/malloc.h>
48 #include <sys/proc.h>
49 #include <sys/buf.h>
50 #include <sys/interrupt.h>
51 #include <sys/vnode.h>
52 #include <sys/vmmeter.h>
53 #include <sys/kernel.h>
54 #include <sys/sysctl.h>
55 #include <sys/unistd.h>
56 #include <sys/lwp.h>
58 #include <machine/clock.h>
59 #include <machine/cpu.h>
60 #include <machine/md_var.h>
61 #include <machine/smp.h>
62 #include <machine/pcb.h>
63 #include <machine/pcb_ext.h>
64 #include <machine/segments.h>
65 #include <machine/globaldata.h> /* npxthread */
66 #include <machine/vmm.h>
68 #include <vm/vm.h>
69 #include <vm/vm_param.h>
70 #include <sys/lock.h>
71 #include <vm/vm_kern.h>
72 #include <vm/vm_page.h>
73 #include <vm/vm_map.h>
74 #include <vm/vm_extern.h>
76 #include <sys/thread2.h>
77 #include <sys/mplock2.h>
79 #include <bus/isa/isa.h>
/* Hard machine reset via the keyboard controller (defined below). */
static void cpu_reset_real (void);

/*
 * Global spectre mitigation mode.
 *   -1 = not probed / no hardware support
 *    0 = supported but disabled
 *    1 = IBRS mode 1 (per spectre_sysctl_changed(); impacts syscalls)
 *    2 = IBRS mode 2 (impacts whole-machine performance)
 */
int spectre_mitigation = -1;
86 * Finish a fork operation, with lwp lp2 nearly set up.
87 * Copy and update the pcb, set up the stack so that the child
88 * ready to run and return to user mode.
90 void
91 cpu_fork(struct lwp *lp1, struct lwp *lp2, int flags)
93 struct pcb *pcb2;
94 struct pmap *pmap2;
96 if ((flags & RFPROC) == 0) {
97 if ((flags & RFMEM) == 0) {
99 * Unshare user LDT. > 1 test is MPSAFE. While
100 * it can potentially race a 2->1 transition, the
101 * worst that happens is that we do an unnecessary
102 * ldt replacement.
104 struct pcb *pcb1 = lp1->lwp_thread->td_pcb;
105 struct pcb_ldt *pcb_ldt = pcb1->pcb_ldt;
107 if (pcb_ldt && pcb_ldt->ldt_refcnt > 1) {
108 pcb_ldt = user_ldt_alloc(pcb1,pcb_ldt->ldt_len);
109 user_ldt_free(pcb1);
110 pcb1->pcb_ldt = pcb_ldt;
111 set_user_ldt(pcb1);
114 return;
117 /* Ensure that lp1's pcb is up to date. */
118 if (mdcpu->gd_npxthread == lp1->lwp_thread)
119 npxsave(lp1->lwp_thread->td_savefpu);
122 * Copy lp1's PCB. This really only applies to the
123 * debug registers and FP state, but its faster to just copy the
124 * whole thing. Because we only save the PCB at switchout time,
125 * the register state may not be current.
127 pcb2 = lp2->lwp_thread->td_pcb;
128 *pcb2 = *lp1->lwp_thread->td_pcb;
131 * Create a new fresh stack for the new process.
132 * Copy the trap frame for the return to user mode as if from a
133 * syscall. This copies the user mode register values.
135 * pcb_rsp must allocate an additional call-return pointer below
136 * the trap frame which will be restored by cpu_heavy_restore from
137 * PCB_RIP, and the thread's td_sp pointer must allocate an
138 * additonal two quadwords below the pcb_rsp call-return pointer to
139 * hold the LWKT restore function pointer and rflags.
141 * The LWKT restore function pointer must be set to cpu_heavy_restore,
142 * which is our standard heavy-weight process switch-in function.
143 * YYY eventually we should shortcut fork_return and fork_trampoline
144 * to use the LWKT restore function directly so we can get rid of
145 * all the extra crap we are setting up.
147 lp2->lwp_md.md_regs = (struct trapframe *)pcb2 - 1;
148 bcopy(lp1->lwp_md.md_regs, lp2->lwp_md.md_regs, sizeof(*lp2->lwp_md.md_regs));
151 * Set registers for trampoline to user mode. Leave space for the
152 * return address on stack. These are the kernel mode register values.
154 * Set the new pmap CR3. If the new process uses isolated VM spaces,
155 * also set the isolated CR3.
157 pmap2 = vmspace_pmap(lp2->lwp_proc->p_vmspace);
158 pcb2->pcb_cr3 = vtophys(pmap2->pm_pml4);
159 if ((pcb2->pcb_flags & PCB_ISOMMU) && pmap2->pm_pmlpv_iso) {
160 pcb2->pcb_cr3_iso = vtophys(pmap2->pm_pml4_iso);
161 } else {
162 pcb2->pcb_flags &= ~PCB_ISOMMU;
163 pcb2->pcb_cr3_iso = 0;
166 #if 0
168 * Per-process spectre mitigation (future)
170 pcb2->pcb_flags &= ~(PCB_IBRS1 | PCB_IBRS2);
171 switch (spectre_mitigation) {
172 case 1:
173 pcb2->pcb_flags |= PCB_IBRS1;
174 break;
175 case 2:
176 pcb2->pcb_flags |= PCB_IBRS2;
177 break;
178 default:
179 break;
181 #endif
183 pcb2->pcb_rbx = (unsigned long)fork_return; /* fork_trampoline argument */
184 pcb2->pcb_rbp = 0;
185 pcb2->pcb_rsp = (unsigned long)lp2->lwp_md.md_regs - sizeof(void *);
186 pcb2->pcb_r12 = (unsigned long)lp2; /* fork_trampoline argument */
187 pcb2->pcb_r13 = 0;
188 pcb2->pcb_r14 = 0;
189 pcb2->pcb_r15 = 0;
190 pcb2->pcb_rip = (unsigned long)fork_trampoline;
191 lp2->lwp_thread->td_sp = (char *)(pcb2->pcb_rsp - sizeof(void *));
192 *(u_int64_t *)lp2->lwp_thread->td_sp = PSL_USER;
193 lp2->lwp_thread->td_sp -= sizeof(void *);
194 *(void **)lp2->lwp_thread->td_sp = (void *)cpu_heavy_restore;
197 * pcb2->pcb_ldt: duplicated below, if necessary.
198 * pcb2->pcb_savefpu: cloned above.
199 * pcb2->pcb_flags: cloned above
200 * pcb2->pcb_onfault: cloned above (always NULL here).
201 * pcb2->pcb_onfault_sp:cloned above (dont care)
205 * XXX don't copy the i/o pages. this should probably be fixed.
207 pcb2->pcb_ext = NULL;
209 /* Copy the LDT, if necessary. */
210 if (pcb2->pcb_ldt != NULL) {
211 if (flags & RFMEM) {
212 atomic_add_int(&pcb2->pcb_ldt->ldt_refcnt, 1);
213 } else {
214 pcb2->pcb_ldt = user_ldt_alloc(pcb2,
215 pcb2->pcb_ldt->ldt_len);
218 bcopy(&lp1->lwp_thread->td_tls, &lp2->lwp_thread->td_tls,
219 sizeof(lp2->lwp_thread->td_tls));
221 * Now, cpu_switch() can schedule the new lwp.
222 * pcb_rsp is loaded pointing to the cpu_switch() stack frame
223 * containing the return address when exiting cpu_switch.
224 * This will normally be to fork_trampoline(), which will have
225 * %rbx loaded with the new lwp's pointer. fork_trampoline()
226 * will set up a stack to call fork_return(lp, frame); to complete
227 * the return to user-mode.
232 * Prepare new lwp to return to the address specified in params.
235 cpu_prepare_lwp(struct lwp *lp, struct lwp_params *params)
237 struct trapframe *regs = lp->lwp_md.md_regs;
238 void *bad_return = NULL;
239 int error;
241 regs->tf_rip = (long)params->lwp_func;
242 regs->tf_rsp = (long)params->lwp_stack;
243 /* Set up argument for function call */
244 regs->tf_rdi = (long)params->lwp_arg;
247 * Set up fake return address. As the lwp function may never return,
248 * we simply copy out a NULL pointer and force the lwp to receive
249 * a SIGSEGV if it returns anyways.
251 regs->tf_rsp -= sizeof(void *);
252 error = copyout(&bad_return, (void *)regs->tf_rsp, sizeof(bad_return));
253 if (error)
254 return (error);
256 if (lp->lwp_proc->p_vmm) {
257 lp->lwp_thread->td_pcb->pcb_cr3 = KPML4phys;
258 cpu_set_fork_handler(lp,
259 (void (*)(void *, struct trapframe *))vmm_lwp_return, lp);
260 } else {
261 cpu_set_fork_handler(lp,
262 (void (*)(void *, struct trapframe *))generic_lwp_return, lp);
264 return (0);
268 * Intercept the return address from a freshly forked process that has NOT
269 * been scheduled yet.
271 * This is needed to make kernel threads stay in kernel mode.
273 void
274 cpu_set_fork_handler(struct lwp *lp, void (*func)(void *, struct trapframe *),
275 void *arg)
278 * Note that the trap frame follows the args, so the function
279 * is really called like this: func(arg, frame);
281 lp->lwp_thread->td_pcb->pcb_rbx = (long)func; /* function */
282 lp->lwp_thread->td_pcb->pcb_r12 = (long)arg; /* first arg */
285 void
286 cpu_set_thread_handler(thread_t td, void (*rfunc)(void), void *func, void *arg)
288 td->td_pcb->pcb_rbx = (long)func;
289 td->td_pcb->pcb_r12 = (long)arg;
290 td->td_switch = cpu_lwkt_switch;
291 td->td_sp -= sizeof(void *);
292 *(void **)td->td_sp = rfunc; /* exit function on return */
293 td->td_sp -= sizeof(void *);
294 *(void **)td->td_sp = cpu_kthread_restore;
297 void
298 cpu_lwp_exit(void)
300 struct thread *td = curthread;
301 struct pcb *pcb;
303 pcb = td->td_pcb;
305 /* Some x86 functionality was dropped */
306 KKASSERT(pcb->pcb_ext == NULL);
309 * disable all hardware breakpoints
311 if (pcb->pcb_flags & PCB_DBREGS) {
312 reset_dbregs();
313 pcb->pcb_flags &= ~PCB_DBREGS;
315 td->td_gd->gd_cnt.v_swtch++;
317 crit_enter_quick(td);
318 if (td->td_flags & TDF_TSLEEPQ)
319 tsleep_remove(td);
320 lwkt_deschedule_self(td);
321 lwkt_remove_tdallq(td);
322 cpu_thread_exit();
326 * Terminate the current thread. The caller must have already acquired
327 * the thread's rwlock and placed it on a reap list or otherwise notified
328 * a reaper of its existance. We set a special assembly switch function which
329 * releases td_rwlock after it has cleaned up the MMU state and switched
330 * out the stack.
332 * Must be caller from a critical section and with the thread descheduled.
334 void
335 cpu_thread_exit(void)
337 npxexit();
338 curthread->td_switch = cpu_exit_switch;
339 curthread->td_flags |= TDF_EXITING;
340 lwkt_switch();
341 panic("cpu_thread_exit: lwkt_switch() unexpectedly returned");
/* Public entry point: perform a hard machine reset. */
void
cpu_reset(void)
{
	cpu_reset_real();
}
350 static void
351 cpu_reset_real(void)
354 * Attempt to do a CPU reset via the keyboard controller,
355 * do not turn off the GateA20, as any machine that fails
356 * to do the reset here would then end up in no man's land.
359 #if !defined(BROKEN_KEYBOARD_RESET)
360 outb(IO_KBD + 4, 0xFE);
361 DELAY(500000); /* wait 0.5 sec to see if that did it */
362 kprintf("Keyboard reset did not work, attempting CPU shutdown\n");
363 DELAY(1000000); /* wait 1 sec for kprintf to complete */
364 #endif
365 #if 0 /* JG */
366 /* force a shutdown by unmapping entire address space ! */
367 bzero((caddr_t) PTD, PAGE_SIZE);
368 #endif
370 /* "good night, sweet prince .... <THUNK!>" */
371 cpu_invltlb();
372 /* NOTREACHED */
373 while(1);
377 * Convert kernel VA to physical address
379 vm_paddr_t
380 kvtop(void *addr)
382 vm_paddr_t pa;
384 pa = pmap_kextract((vm_offset_t)addr);
385 if (pa == 0)
386 panic("kvtop: zero page frame");
387 return (pa);
390 static void
391 swi_vm(void *arg, void *frame)
393 if (busdma_swi_pending != 0)
394 busdma_swi();
397 static void
398 swi_vm_setup(void *arg)
400 register_swi_mp(SWI_VM, swi_vm, NULL, "swi_vm", NULL, 0);
403 SYSINIT(swi_vm_setup, SI_BOOT2_MACHDEP, SI_ORDER_ANY, swi_vm_setup, NULL);
/*
 * NOTE: This routine is also called after a successful microcode
 *	 reload on cpu 0 (see the arg != NULL path below).
 */
void spectre_vm_setup(void *arg);
412 * Check for IBRS support
414 static
416 spectre_check_support(void)
418 uint32_t p[4];
420 p[0] = 0;
421 p[1] = 0;
422 p[2] = 0;
423 p[3] = 0;
424 cpuid_count(7, 0, p);
425 if ((p[3] & 0x0C000000U) == 0x0C000000U) {
428 * SPEC_CTRL (bit 26) and STIBP support (bit 27)
430 * 0x80000008 p[0] bit 12 indicates IBPB support
432 p[0] = 0;
433 p[1] = 0;
434 p[2] = 0;
435 p[3] = 0;
436 do_cpuid(0x80000008U, p);
437 if (p[0] & 0x00001000)
438 return 1;
440 return 0;
444 * Iterate CPUs and adjust MSR for global operations, since
445 * the KMMU* code won't do it if spectre_mitigation is 0 or 2.
447 static
448 void
449 spectre_sysctl_changed(int old_value)
451 globaldata_t save_gd;
452 int n;
454 save_gd = mycpu;
455 for (n = 0; n < ncpus; ++n) {
456 lwkt_setcpu_self(globaldata_find(n));
458 pscpu->trampoline.tr_pcb_gflags &= ~(PCB_IBRS1 | PCB_IBRS2);
460 switch(spectre_mitigation) {
461 case 0:
462 if (old_value >= 0)
463 wrmsr(0x48, 0);
464 break;
465 case 1:
466 pscpu->trampoline.tr_pcb_gflags |= PCB_IBRS1;
467 wrmsr(0x48, 1);
468 break;
469 case 2:
470 pscpu->trampoline.tr_pcb_gflags |= PCB_IBRS2;
471 wrmsr(0x48, 1);
472 break;
475 if (save_gd != mycpu)
476 lwkt_setcpu_self(save_gd);
480 * User changes sysctl value
482 static int
483 sysctl_spectre_mitigation(SYSCTL_HANDLER_ARGS)
485 int new_spectre;
486 int old_spectre;
487 int error;
489 old_spectre = spectre_mitigation;
490 new_spectre = old_spectre;
491 error = sysctl_handle_int(oidp, &new_spectre, 0, req);
492 if (error || req->newptr == NULL)
493 return error;
494 spectre_mitigation = new_spectre;
495 spectre_sysctl_changed(old_spectre);
497 return 0;
500 SYSCTL_PROC(_machdep, OID_AUTO, spectre_mitigation, CTLTYPE_INT | CTLFLAG_RW,
501 0, 0, sysctl_spectre_mitigation, "I", "Spectre exploit mitigation");
503 void
504 spectre_vm_setup(void *arg)
506 int inconsistent = 0;
507 int old_value = spectre_mitigation;
509 if (spectre_mitigation < 0) {
510 TUNABLE_INT_FETCH("machdep.spectre_mitigation",
511 &spectre_mitigation);
514 if (cpu_vendor_id == CPU_VENDOR_INTEL) {
515 if (spectre_check_support()) {
517 * Must be supported on all cpus before we
518 * can enable it. Returns silently if it
519 * isn't.
521 * NOTE! arg != NULL indicates we were called
522 * from cpuctl after a successful microcode
523 * update.
525 if (arg != NULL) {
526 globaldata_t save_gd;
527 int n;
529 save_gd = mycpu;
530 for (n = 0; n < ncpus; ++n) {
531 lwkt_setcpu_self(globaldata_find(n));
532 if (spectre_check_support() == 0) {
533 inconsistent = 1;
534 break;
537 if (save_gd != mycpu)
538 lwkt_setcpu_self(save_gd);
540 if (inconsistent == 0) {
541 if (spectre_mitigation < 0)
542 spectre_mitigation = 1;
543 } else {
544 spectre_mitigation = -1;
546 } else {
547 spectre_mitigation = -1;
549 } else {
550 spectre_mitigation = -1; /* no support */
554 * Be silent while microcode is being loaded on various CPUs,
555 * until all done.
557 if (inconsistent)
558 return;
561 * Disallow sysctl changes when there is no support (otherwise
562 * the wrmsr will cause a protection fault).
564 switch(spectre_mitigation) {
565 case 0:
566 sysctl___machdep_spectre_mitigation.oid_kind |= CTLFLAG_WR;
567 kprintf("machdep.spectre_mitigation available but disabled\n");
568 break;
569 case 1:
570 sysctl___machdep_spectre_mitigation.oid_kind |= CTLFLAG_WR;
571 kprintf("machdep.spectre_mitigation available, system call\n"
572 "performance and kernel operation will be impacted\n");
573 break;
574 case 2:
575 sysctl___machdep_spectre_mitigation.oid_kind |= CTLFLAG_WR;
576 kprintf("machdep.spectre_mitigation available, whole machine\n"
577 "performance will be impacted\n");
578 break;
579 default:
580 sysctl___machdep_spectre_mitigation.oid_kind &= ~CTLFLAG_WR;
581 if (cpu_vendor_id == CPU_VENDOR_INTEL)
582 kprintf("no microcode spectre mitigation available\n");
583 break;
585 spectre_sysctl_changed(old_value);
/* Run the spectre probe during machine-dependent boot setup. */
SYSINIT(spectre_vm_setup, SI_BOOT2_MACHDEP, SI_ORDER_ANY,
	spectre_vm_setup, NULL);
592 * platform-specific vmspace initialization (nothing for x86_64)
594 void
595 cpu_vmspace_alloc(struct vmspace *vm __unused)
599 void
600 cpu_vmspace_free(struct vmspace *vm __unused)
605 kvm_access_check(vm_offset_t saddr, vm_offset_t eaddr, int prot)
607 vm_offset_t addr;
609 if (saddr < KvaStart)
610 return EFAULT;
611 if (eaddr >= KvaEnd)
612 return EFAULT;
613 for (addr = saddr; addr < eaddr; addr += PAGE_SIZE) {
614 if (pmap_kextract(addr) == 0)
615 return EFAULT;
617 if (!kernacc((caddr_t)saddr, eaddr - saddr, prot))
618 return EFAULT;
619 return 0;
#if 0
/*
 * Disabled debugging aids: sanity-check trap frames on kernel
 * entry/exit against the lwp's saved user sp/pc and the kstack bounds.
 */
void _test_frame_enter(struct trapframe *frame);
void _test_frame_exit(struct trapframe *frame);

void
_test_frame_enter(struct trapframe *frame)
{
	thread_t td = curthread;

	if (ISPL(frame->tf_cs) == SEL_UPL) {
		KKASSERT(td->td_lwp);
		KASSERT(td->td_lwp->lwp_md.md_regs == frame,
			("_test_frame_exit: Frame mismatch %p %p",
			td->td_lwp->lwp_md.md_regs, frame));
		/* snapshot user sp/pc for the matching _test_frame_exit */
		td->td_lwp->lwp_saveusp = (void *)frame->tf_rsp;
		td->td_lwp->lwp_saveupc = (void *)frame->tf_rip;
	}
	if ((char *)frame < td->td_kstack ||
	    (char *)frame > td->td_kstack + td->td_kstack_size) {
		panic("_test_frame_exit: frame not on kstack %p kstack=%p",
			frame, td->td_kstack);
	}
}

void
_test_frame_exit(struct trapframe *frame)
{
	thread_t td = curthread;

	if (ISPL(frame->tf_cs) == SEL_UPL) {
		KKASSERT(td->td_lwp);
		KASSERT(td->td_lwp->lwp_md.md_regs == frame,
			("_test_frame_exit: Frame mismatch %p %p",
			td->td_lwp->lwp_md.md_regs, frame));
		if (td->td_lwp->lwp_saveusp != (void *)frame->tf_rsp) {
			kprintf("_test_frame_exit: %s:%d usp mismatch %p/%p\n",
				td->td_comm, td->td_proc->p_pid,
				td->td_lwp->lwp_saveusp,
				(void *)frame->tf_rsp);
		}
		if (td->td_lwp->lwp_saveupc != (void *)frame->tf_rip) {
			kprintf("_test_frame_exit: %s:%d upc mismatch %p/%p\n",
				td->td_comm, td->td_proc->p_pid,
				td->td_lwp->lwp_saveupc,
				(void *)frame->tf_rip);
		}

		/*
		 * adulterate the fields to catch entries that
		 * don't run through test_frame_enter
		 */
		td->td_lwp->lwp_saveusp =
			(void *)~(intptr_t)td->td_lwp->lwp_saveusp;
		td->td_lwp->lwp_saveupc =
			(void *)~(intptr_t)td->td_lwp->lwp_saveupc;
	}
	if ((char *)frame < td->td_kstack ||
	    (char *)frame > td->td_kstack + td->td_kstack_size) {
		panic("_test_frame_exit: frame not on kstack %p kstack=%p",
			frame, td->td_kstack);
	}
}

#endif