From 11ba7f73d6e534d54da55d5c4a1ac1553cc62b45 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Wed, 9 Aug 2017 22:20:52 -0700
Subject: [PATCH] kernel - Lower VM_MAX_USER_ADDRESS to finalize work-around
 for Ryzen bug

* Reduce VM_MAX_USER_ADDRESS by 2MB, effectively making the top 2MB of
  the user address space unmappable.  The user stack now starts 2MB down
  from where it did before.  Theoretically we only need to reduce the
  top of the user address space by 4KB, but doing it by 2MB may be more
  useful for future page-table optimizations.

* As per AMD, Ryzen has an issue when the instruction pre-fetcher
  crosses from canonical to non-canonical address space.  This can only
  occur at the top of the user stack.  In DragonFlyBSD, the signal
  trampoline resides at the top of the user stack, and an IRETQ into it
  can cause a Ryzen box to lock up and destabilize.

  The bug case was, basically, two cpu threads on the same core: one
  running a cpu-bound loop of some sort while the other takes a normal
  UNIX signal (causing the IRETQ into the signal trampoline).  The IRETQ
  microcode freezes until the cpu-bound loop terminates, preventing the
  cpu thread from taking any interrupt or IPI whatsoever for the
  duration, and the cpu may destabilize afterwards as well.

* The pre-fetcher is somewhat heuristic, so just moving the trampoline
  down is no guarantee as long as the top 4KB of the user stack remains
  mapped or mappable.  It is better to make the boundary unmappable by
  userland.

* Bug first tracked down by me in early 2017.  AMD validated the bug
  and determined that unmapping the boundary page completely solves the
  issue.

* Also retain the code which places the signal trampoline in its own
  page so we can maintain separate protection settings for the code,
  and make it read-only (R+X).
---
 sys/cpu/x86_64/include/signal.h          |  5 +++++
 sys/kern/kern_exec.c                     | 10 ++--------
 sys/platform/pc64/include/vmparam.h      | 10 +++++++++-
 sys/platform/vkernel64/include/vmparam.h |  9 ++++++++-
 4 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/sys/cpu/x86_64/include/signal.h b/sys/cpu/x86_64/include/signal.h
index 777a1ae459..254fd82996 100644
--- a/sys/cpu/x86_64/include/signal.h
+++ b/sys/cpu/x86_64/include/signal.h
@@ -97,6 +97,11 @@ struct sigcontext {
 
 #endif /* __BSD_VISIBLE */
 
+/*
+ * Tells the kernel how many bytes to drop the signal trampoline down
+ * when mapping it.  Must be at least one page because kern_exec
+ * mprotects it to R+X (making it unwritable).
+ */
 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
 #define SZSIGCODE_EXTRA_BYTES	(1*PAGE_SIZE)
 #endif
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index fc20013527..1c5775995b 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -1109,18 +1109,12 @@ exec_copyout_strings(struct image_params *imgp)
 	suword64((void *)vectp, 0);
 
 	/*
-	 * Make the signal trampoline executable. This also makes ps_strings
-	 * executable but generally speaking there should be no direct access
-	 * to ps_strings after the program has gotten to _main().
-	 *
-	 * At the moment the space is writable because ps_strings needs to be
-	 * writable. XXX future give sigtramp its own page (maybe put it in
-	 * the kpmap).
+	 * Make the signal trampoline executable and read-only.
 	 */
 	vm_map_protect(&imgp->proc->p_vmspace->vm_map,
 		       (vm_offset_t)szsigbase,
 		       (vm_offset_t)szsigbase + PAGE_SIZE,
-		       VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE, FALSE);
+		       VM_PROT_READ|VM_PROT_EXECUTE, FALSE);
 
 	return (stack_base);
 }
diff --git a/sys/platform/pc64/include/vmparam.h b/sys/platform/pc64/include/vmparam.h
index 113fe32028..eeae0e71cb 100644
--- a/sys/platform/pc64/include/vmparam.h
+++ b/sys/platform/pc64/include/vmparam.h
@@ -123,8 +123,16 @@
 #define UPT_MAX_ADDRESS	KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
 #define UPT_MIN_ADDRESS	KVADDR(PML4PML4I, 0, 0, 0)
 
+/*
+ * Do not allow the top 2MB of the user stack to be mapped to work around
+ * an AMD bug wherein the cpu can lock up or destabilize if the instruction
+ * prefetcher crosses over into non-canonical address space.  Only the top
+ * 4KB needs to be disallowed, but taking out the whole 2MB allows potential
+ * 2MB PTE optimizations to be retained.
+ */
 #define VM_MIN_USER_ADDRESS	((vm_offset_t)0)
-#define VM_MAX_USER_ADDRESS	UVADDR(NUPDP_USER, 0, 0, 0)
+#define VM_MAX_USER_ADDRESS	UVADDR(NUPDP_USER - 1, NPDPEPG - 1, \
+				       NPDEPG - 1, 0)
 
 #define USRSTACK		VM_MAX_USER_ADDRESS
diff --git a/sys/platform/vkernel64/include/vmparam.h b/sys/platform/vkernel64/include/vmparam.h
index 2ca9e3ccde..31a380489e 100644
--- a/sys/platform/vkernel64/include/vmparam.h
+++ b/sys/platform/vkernel64/include/vmparam.h
@@ -79,8 +79,15 @@
 
 #define VKERNEL_USEABLE_PHYS_RAM_BASE	(128ULL << 30)	/* from 32GB */
 
+/*
+ * Do not allow the top 2MB of the user stack to be mapped to work around
+ * an AMD bug wherein the cpu can lock up or destabilize if the instruction
+ * prefetcher crosses over into non-canonical address space.  Only the top
+ * 4KB needs to be disallowed, but taking out the whole 2MB allows potential
+ * 2MB PTE optimizations to be retained.
+ */
 #define VM_MIN_USER_ADDRESS	((vm_offset_t)0)
-#define VM_MAX_USER_ADDRESS	UVADDR(NUPML4E, 0, 0, 0)
+#define VM_MAX_USER_ADDRESS	UVADDR(NUPML4E - 1, NPDPEPG - 1, NPDEPG - 1, 0)
 
 #define DMAP_SIZE	(512UL * 1024 * 1024 * 1024)
-- 
2.11.4.GIT
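
To see concretely why the new UVADDR() expressions land exactly 2MB below
the old limit, here is a minimal stand-alone sketch of the index arithmetic,
using the vkernel64 change as the example.  It assumes the standard x86_64
4-level paging shifts; uvaddr() and the nupml4e value are illustrative
stand-ins for DragonFly's actual UVADDR() macro and NUPML4E constant, not
the real definitions:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PT_SHIFT	12	/* 4KB page */
#define PD_SHIFT	21	/* each PD entry maps 2MB */
#define PDP_SHIFT	30	/* each PDP entry maps 1GB */
#define PML4_SHIFT	39	/* each PML4 entry maps 512GB */
#define NPDPEPG		512UL	/* PDP entries per page */
#define NPDEPG		512UL	/* PD entries per page */

/* build a VA from the four page-table indices, as UVADDR() would */
static uint64_t
uvaddr(uint64_t l4, uint64_t l3, uint64_t l2, uint64_t l1)
{
	return ((l4 << PML4_SHIFT) | (l3 << PDP_SHIFT) |
		(l2 << PD_SHIFT) | (l1 << PT_SHIFT));
}

int
main(void)
{
	uint64_t nupml4e = 255;	/* illustrative stand-in for NUPML4E */
	uint64_t old_top = uvaddr(nupml4e, 0, 0, 0);
	uint64_t new_top = uvaddr(nupml4e - 1, NPDPEPG - 1, NPDEPG - 1, 0);

	printf("old VM_MAX_USER_ADDRESS 0x%016" PRIx64 "\n", old_top);
	printf("new VM_MAX_USER_ADDRESS 0x%016" PRIx64 "\n", new_top);
	printf("difference %" PRIu64 " bytes\n", old_top - new_top);
	return (0);	/* prints a difference of 2097152, i.e. 2MB */
}

Decrementing the PML4 index while backing the PDP and PD indices off to
their maximum values, with the PT index zeroed, steps the limit down by
exactly one PD entry (512 * 4KB = 2MB): the 2MB reduction described in the
comments above, rounded up from the 4KB the erratum strictly requires.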