From 4b5665564ef37dc939a3a9ffbafaab9894c18885 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Sat, 16 Feb 2019 16:41:25 -0800 Subject: [PATCH] kernel - Implement sbrk(), change low-address mmap hinting * Change mmap()'s internal lower address bound from dmax (32GB) to RLIMIT_DATA's current value. This allows the rlimit to be reduced, for example, so that hinted mmap()s can then map space below the 4GB mark. The default data rlimit is 32GB. This change is needed to support several languages, at least Lua and probably another one or two, which use mmap hinting under the assumption that it can map space below the 4GB address mark. The data limit must be lowered with a limit command too, which can be scripted or patched for such programs. * Implement the sbrk() system call. This system call was already present but just returned EOPNOTSUPP and libc previously had its own shim for sbrk() which used the ancient break() system call. (Note that the prior implementation did not return ENOSYS or raise a signal). sbrk() in the kernel is thread-safe for positive increments and is also byte-granular (the old libc sbrk() was only page-granular). sbrk() in the kernel does not implement negative increments and will return EOPNOTSUPP if asked to. Negative increments were historically designed to be able to 'free' memory allocated with sbrk(), but it is not possible to implement the case in a modern VM system due to the mmap changes above, for two reasons: (1) the new mmap hinting changes make it possible for normal mmap()s to have mapped space prior to the RLIMIT_DATA resource limit being increased, causing intermingling of sbrk() and user mmap()d regions, and (2) negative increments are not even remotely thread-safe. * Note that the previous commit refactored libc to use the kernel sbrk() and fall back to its previous emulation code on failure, so libc supports both new and old kernels. * Remove the brk() shim from libc. brk() is not implemented by the kernel. Symbol removed. 
Requires testing against ports so we may have to add it back in but basically there is no way to implement brk() properly with the mmap() hinting fix * Adjust manual pages. --- include/unistd.h | 2 +- lib/libc/sys/brk.2 | 50 +++++++++++++++---- sys/kern/imgact_aout.c | 4 +- sys/kern/imgact_elf.c | 8 ++-- sys/kern/imgact_gzip.c | 8 ++-- sys/kern/init_main.c | 2 +- sys/kern/kern_checkpoint.c | 4 +- sys/kern/kern_clock.c | 6 +-- sys/kern/kern_exec.c | 5 +- sys/kern/kern_fork.c | 16 +++---- sys/kern/kern_kinfo.c | 6 +-- sys/kern/syscalls.master | 4 +- sys/libkern/arc4random.c | 29 +++++++---- sys/sys/ckpt.h | 4 +- sys/sys/libkern.h | 1 + sys/sys/param.h | 5 +- sys/vm/vm_map.c | 38 ++++++++++----- sys/vm/vm_map.h | 6 +-- sys/vm/vm_mmap.c | 10 ---- sys/vm/vm_unix.c | 117 +++++++++++++++++++++++++++++++++++++++++++-- 20 files changed, 244 insertions(+), 81 deletions(-) diff --git a/include/unistd.h b/include/unistd.h index 37f235c10a..e9711998f6 100644 --- a/include/unistd.h +++ b/include/unistd.h @@ -507,7 +507,7 @@ void sync(void); #endif /* __XSI_VISIBLE */ #if __BSD_VISIBLE || (__XSI_VISIBLE && __XSI_VISIBLE < 600) -int brk(const void *); +/* int brk(const void *); no longer implemented */ int chroot(const char *); int chroot_kernel(const char *); int getdtablesize(void); diff --git a/lib/libc/sys/brk.2 b/lib/libc/sys/brk.2 index ef21b8c6f6..8339fa1e16 100644 --- a/lib/libc/sys/brk.2 +++ b/lib/libc/sys/brk.2 @@ -35,7 +35,7 @@ .Sh NAME .Nm brk , .Nm sbrk -.Nd change data segment size +.Nd change data segment size (obsolete) .Sh LIBRARY .Lb libc .Sh SYNOPSIS @@ -53,6 +53,14 @@ and .Fn sbrk functions are legacy interfaces from before the advent of modern virtual memory management. +.Fn brk +is no longer implemented by +.Dx +and +.Fn sbrk +has only limited functionality due to having to play nice with +modern system calls such as +.Xr mmap 2 . 
.Ef .Pp The @@ -67,25 +75,40 @@ The break is the first address after the end of the process's uninitialized data segment (also known as the .Dq BSS ) . .Pp +The break range is limited by the +.Dv RLIMIT_DATA +resource limit applied to the process. +.Pp The .Fn brk function sets the break to .Fa addr . +.Dx +no longer implements this function. .Pp The .Fn sbrk function raises the break by .Fa incr -bytes, thus allocating at least -.Fa incr -bytes of new memory in the data segment. -If -.Fa incr -is negative, -the break is lowered by +bytes, returning a pointer to the base of the new memory. +.Pp +In the traditional call, a negative .Fa incr -bytes. +lowers the break address by the specified number of bytes. +However, +.Dx +no longer supports using this legacy function to lower the break +address. The reason is that the resource limit can be adjusted +upward and downward at run-time, which indirectly allows normal memory mappings +via +.Fn mmap +to infiltrate the traditional data area. In addition, lowering the break +address in this manner is not thread-safe. +Any attempt to lower the break point will return +.Po Vt "void *" Pc Ns \-1 +and set errno to +.Er EOPNOTSUPP . .Sh NOTES While the actual process data segment size maintained by the kernel will only grow or shrink in page sizes, these functions allow setting the break @@ -94,6 +117,10 @@ page of the data segment). .Pp The current value of the program break may be determined by calling .Fn sbrk 0 . +.Pp +The +.Fn sbrk +function is thread-safe. See also .Xr end 3 . .Pp @@ -118,7 +145,7 @@ for the definition of .Pp The .Fn sbrk -function returns the prior break value if successful; +function returns the prior break pointer if successful; otherwise the value .Po Vt "void *" Pc Ns \-1 is returned and the global variable @@ -139,6 +166,9 @@ was exceeded. .It Bq Er ENOMEM Insufficient space existed in the swap area to support the expansion of the data segment. 
+.It Bq Er EOPNOTSUPP +An attempt has been made to perform an action that is no longer supported +by this function. .El .Sh SEE ALSO .Xr execve 2 , diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c index 509cee5da5..047e3d4238 100644 --- a/sys/kern/imgact_aout.c +++ b/sys/kern/imgact_aout.c @@ -233,8 +233,8 @@ exec_aout_imgact(struct image_params *imgp) vm_map_entry_release(count); /* Fill in process VM information */ - vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT; - vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT; + vmspace->vm_tsize = a_out->a_text; /* in bytes */ + vmspace->vm_dsize = a_out->a_data + bss_size; /* in bytes */ vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset; vmspace->vm_daddr = (caddr_t) (uintptr_t) (virtual_offset + a_out->a_text); diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index d2d3cb9f6c..e7534942ff 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -816,9 +816,9 @@ __CONCAT(exec_,__elfN(imgact))(struct image_params *imgp) } } - vmspace->vm_tsize = text_size >> PAGE_SHIFT; + vmspace->vm_tsize = text_size; /* in bytes */ vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr; - vmspace->vm_dsize = data_size >> PAGE_SHIFT; + vmspace->vm_dsize = data_size; /* in bytes */ vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr; addr = ELF_RTLD_ADDR(vmspace); @@ -1659,8 +1659,8 @@ elf_puttextvp(struct proc *p, elf_buf_t target) vminfo = target_reserve(target, sizeof(struct ckpt_vminfo), &error); if (vminfo != NULL) { - vminfo->cvm_dsize = p->p_vmspace->vm_dsize; - vminfo->cvm_tsize = p->p_vmspace->vm_tsize; + vminfo->cvm_dsize = btoc(p->p_vmspace->vm_dsize); /* pages */ + vminfo->cvm_tsize = btoc(p->p_vmspace->vm_tsize); /* pages */ vminfo->cvm_daddr = p->p_vmspace->vm_daddr; vminfo->cvm_taddr = p->p_vmspace->vm_taddr; } diff --git a/sys/kern/imgact_gzip.c b/sys/kern/imgact_gzip.c index f93104e52d..60c9c3c3e7 100644 --- a/sys/kern/imgact_gzip.c +++ b/sys/kern/imgact_gzip.c @@ -126,8 +126,8 @@ 
exec_gzip_imgact(struct image_params *imgp) vmspace = imgp->proc->p_vmspace; error = vm_map_protect(&vmspace->vm_map, (vm_offset_t) vmspace->vm_taddr, - (vm_offset_t) (vmspace->vm_taddr + - (vmspace->vm_tsize << PAGE_SHIFT)) , + round_page((vm_offset_t) + (vmspace->vm_taddr + vmspace->vm_tsize)), VM_PROT_READ|VM_PROT_EXECUTE,0); } @@ -258,8 +258,8 @@ do_aout_hdr(struct imgact_gzip * gz) } } /* Fill in process VM information */ - vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT; - vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT; + vmspace->vm_tsize = gz->a_out.a_text; /* in bytes */ + vmspace->vm_dsize = gz->a_out.a_data + gz->bss_size; /* in bytes */ vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset; vmspace->vm_daddr = (caddr_t) (uintptr_t) (gz->virtual_offset + gz->a_out.a_text); diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index dcb928ef6c..991c29eebb 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -591,7 +591,7 @@ start_init(void *dummy, struct trapframe *frame) if (error) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; - p->p_vmspace->vm_ssize = 1; + p->p_vmspace->vm_ssize = PAGE_SIZE; if ((var = kgetenv("init_path")) != NULL) { strncpy(init_path, var, sizeof init_path); diff --git a/sys/kern/kern_checkpoint.c b/sys/kern/kern_checkpoint.c index fac67ab1c2..68415b3006 100644 --- a/sys/kern/kern_checkpoint.c +++ b/sys/kern/kern_checkpoint.c @@ -556,9 +556,9 @@ elf_gettextvp(struct proc *p, struct file *fp) vmspace_exec(p, NULL); p->p_vmspace->vm_daddr = vminfo.cvm_daddr; - p->p_vmspace->vm_dsize = vminfo.cvm_dsize; + p->p_vmspace->vm_dsize = ctob(vminfo.cvm_dsize); /* in bytes */ p->p_vmspace->vm_taddr = vminfo.cvm_taddr; - p->p_vmspace->vm_tsize = vminfo.cvm_tsize; + p->p_vmspace->vm_tsize = ctob(vminfo.cvm_tsize); /* in bytes */ if ((error = read_check(fp, &vpcount, sizeof(int))) != 0) goto done; vnh = kmalloc(sizeof(struct vn_hdr) * vpcount, M_TEMP, 
M_WAITOK); diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index 89f9b4c134..dc937c52aa 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -1043,9 +1043,9 @@ schedclock(systimer_t info, int in_ipi __unused, struct intrframe *frame) */ if ((ru = &lp->lwp_proc->p_ru) && (vm = lp->lwp_proc->p_vmspace) != NULL) { - ru->ru_ixrss += pgtok(vm->vm_tsize); - ru->ru_idrss += pgtok(vm->vm_dsize); - ru->ru_isrss += pgtok(vm->vm_ssize); + ru->ru_ixrss += pgtok(btoc(vm->vm_tsize)); + ru->ru_idrss += pgtok(btoc(vm->vm_dsize)); + ru->ru_isrss += pgtok(btoc(vm->vm_ssize)); if (lwkt_trytoken(&vm->vm_map.token)) { rss = pgtok(vmspace_resident_count(vm)); if (ru->ru_maxrss < rss) diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 95788db689..91c87f5fdb 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -855,11 +855,12 @@ exec_new_vmspace(struct image_params *imgp, struct vmspace *vmcopy) if (error) return (error); - /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the + /* + * vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the * VM_STACK case, but they are still used to monitor the size of the * process stack so we can check the stack rlimit. 
*/ - vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; + vmspace->vm_ssize = sgrowsiz; /* in bytes */ vmspace->vm_maxsaddr = (char *)USRSTACK - maxssiz; return(0); diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 6acb67be2e..01b2777b14 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -674,20 +674,20 @@ fork1(struct lwp *lp1, int flags, struct proc **procp) if (flags == (RFFDG | RFPROC | RFPGLOCK)) { mycpu->gd_cnt.v_forks++; - mycpu->gd_cnt.v_forkpages += p2->p_vmspace->vm_dsize + - p2->p_vmspace->vm_ssize; + mycpu->gd_cnt.v_forkpages += btoc(p2->p_vmspace->vm_dsize) + + btoc(p2->p_vmspace->vm_ssize); } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM | RFPGLOCK)) { mycpu->gd_cnt.v_vforks++; - mycpu->gd_cnt.v_vforkpages += p2->p_vmspace->vm_dsize + - p2->p_vmspace->vm_ssize; + mycpu->gd_cnt.v_vforkpages += btoc(p2->p_vmspace->vm_dsize) + + btoc(p2->p_vmspace->vm_ssize); } else if (p1 == &proc0) { mycpu->gd_cnt.v_kthreads++; - mycpu->gd_cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + - p2->p_vmspace->vm_ssize; + mycpu->gd_cnt.v_kthreadpages += btoc(p2->p_vmspace->vm_dsize) + + btoc(p2->p_vmspace->vm_ssize); } else { mycpu->gd_cnt.v_rforks++; - mycpu->gd_cnt.v_rforkpages += p2->p_vmspace->vm_dsize + - p2->p_vmspace->vm_ssize; + mycpu->gd_cnt.v_rforkpages += btoc(p2->p_vmspace->vm_dsize) + + btoc(p2->p_vmspace->vm_ssize); } /* diff --git a/sys/kern/kern_kinfo.c b/sys/kern/kern_kinfo.c index 48661b5823..3eaa621325 100644 --- a/sys/kern/kern_kinfo.c +++ b/sys/kern/kern_kinfo.c @@ -154,9 +154,9 @@ fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp) /*kp->kp_vm_prssize = vmspace_president_count(vm);*/ #endif kp->kp_vm_swrss = vm->vm_swrss; - kp->kp_vm_tsize = vm->vm_tsize; - kp->kp_vm_dsize = vm->vm_dsize; - kp->kp_vm_ssize = vm->vm_ssize; + kp->kp_vm_tsize = btoc(vm->vm_tsize); + kp->kp_vm_dsize = btoc(vm->vm_dsize); + kp->kp_vm_ssize = btoc(vm->vm_ssize); } if (p->p_ucred && jailed(p->p_ucred)) diff --git a/sys/kern/syscalls.master 
b/sys/kern/syscalls.master index 6ccb3933ef..1652d8e66c 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -120,8 +120,8 @@ 66 STD { pid_t vfork(void); } 67 OBSOL vread 68 OBSOL vwrite -69 STD { int sbrk(int incr); } -70 STD { int sstk(int incr); } +69 STD { caddr_t sbrk(size_t incr); } +70 STD { int sstk(size_t incr); } 71 OBSOL old mmap 72 OBSOL vadvise 73 STD { int munmap(void *addr, size_t len); } diff --git a/sys/libkern/arc4random.c b/sys/libkern/arc4random.c index 1aeb7ad7c9..f4282d6325 100644 --- a/sys/libkern/arc4random.c +++ b/sys/libkern/arc4random.c @@ -105,11 +105,6 @@ karc4random(void) struct arc4_data *d = arc4_data_pcpu[mycpuid]; uint32_t ret; -#if 0 - /* No one call this function in ISR/ithread. */ - crit_enter(); -#endif - if (++(d->arc4_numruns) > ARC4_MAXRUNS || time_uptime > d->arc4_nextreseed) arc4_randomstir(d); @@ -119,9 +114,27 @@ karc4random(void) ret |= arc4_randbyte(d) << 16; ret |= arc4_randbyte(d) << 24; -#if 0 - crit_exit(); -#endif + return ret; +} + +uint64_t +karc4random64(void) +{ + struct arc4_data *d = arc4_data_pcpu[mycpuid]; + uint64_t ret; + + if (++(d->arc4_numruns) > ARC4_MAXRUNS || + time_uptime > d->arc4_nextreseed) + arc4_randomstir(d); + + ret = arc4_randbyte(d); + ret |= arc4_randbyte(d) << 8; + ret |= arc4_randbyte(d) << 16; + ret |= arc4_randbyte(d) << 24; + ret |= (uint64_t)arc4_randbyte(d) << 32; + ret |= (uint64_t)arc4_randbyte(d) << 40; + ret |= (uint64_t)arc4_randbyte(d) << 48; + ret |= (uint64_t)arc4_randbyte(d) << 56; return ret; } diff --git a/sys/sys/ckpt.h b/sys/sys/ckpt.h index 1fe0f0c109..007500e809 100644 --- a/sys/sys/ckpt.h +++ b/sys/sys/ckpt.h @@ -55,8 +55,8 @@ struct ckpt_filehdr { }; struct ckpt_vminfo { - segsz_t cvm_dsize; - segsz_t cvm_tsize; + segsz_t cvm_dsize; /* in pages */ + segsz_t cvm_tsize; /* in pages */ segsz_t cvm_reserved1[4]; caddr_t cvm_daddr; caddr_t cvm_taddr; diff --git a/sys/sys/libkern.h b/sys/sys/libkern.h index ea329aedd6..0068bd3ff3 100644 --- 
a/sys/sys/libkern.h +++ b/sys/sys/libkern.h @@ -91,6 +91,7 @@ static __inline quad_t qabs(quad_t a) { return (a < 0 ? -a : a); } /* Prototypes for non-quad routines. */ uint32_t karc4random(void); +uint64_t karc4random64(void); void karc4rand(void *, size_t); void arc4_init_pcpu(int cpuid); int bcmp(const void *, const void *, size_t); diff --git a/sys/sys/param.h b/sys/sys/param.h index da1e0ac5c8..141b02acbd 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -192,9 +192,12 @@ * 500500 - 5.5 development * 500501 - reallocarray() added to libc * 500502 - puffs etc. removed + * 500503 - Lowered DATA rlimit supported by mmap(), libc sbrk() emulation + * had to be rewritten. libc brk() removed entirely. These changes + * are required to allow mmap hints to utilize lowered data rlimits. */ #undef __DragonFly_version -#define __DragonFly_version 500502 /* propagated to newvers */ +#define __DragonFly_version 500503 /* propagated to newvers */ #include diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 188d5d8470..e3c12d296f 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -82,6 +82,7 @@ #include #include #include +#include #include #include @@ -3959,7 +3960,7 @@ Retry: /* If this is the main process stack, see if we're over the * stack limit. 
*/ - if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > + if (is_procstack && (vm->vm_ssize + grow_amount > p->p_rlimit[RLIMIT_STACK].rlim_cur)) { rv = KERN_NO_SPACE; goto done; @@ -3970,10 +3971,9 @@ Retry: if (grow_amount > stack_entry->aux.avail_ssize) { grow_amount = stack_entry->aux.avail_ssize; } - if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > + if (is_procstack && (vm->vm_ssize + grow_amount > p->p_rlimit[RLIMIT_STACK].rlim_cur)) { - grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur - - ctob(vm->vm_ssize); + grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur - vm->vm_ssize; } /* If we would blow our VMEM resource limit, no go */ @@ -4017,9 +4017,10 @@ Retry: new_stack_entry->aux.avail_ssize = stack_entry->aux.avail_ssize - (new_stack_entry->end - new_stack_entry->start); - if (is_procstack) - vm->vm_ssize += btoc(new_stack_entry->end - - new_stack_entry->start); + if (is_procstack) { + vm->vm_ssize += new_stack_entry->end - + new_stack_entry->start; + } } if (map->flags & MAP_WIREFUTURE) @@ -4112,6 +4113,16 @@ vm_offset_t vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot) { struct vmspace *vms = p->p_vmspace; + struct rlimit limit; + rlim_t dsiz; + + /* + * Acquire datasize limit for mmap() operation, + * calculate nearest power of 2. + */ + if (kern_getrlimit(RLIMIT_DATA, &limit)) + limit.rlim_cur = maxdsiz; + dsiz = limit.rlim_cur; if (!randomize_mmap || addr != 0) { /* @@ -4121,15 +4132,20 @@ vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot) */ if (addr == 0 || (addr >= round_page((vm_offset_t)vms->vm_taddr) && - addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz))) { - addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz); + addr < round_page((vm_offset_t)vms->vm_daddr + dsiz))) { + addr = round_page((vm_offset_t)vms->vm_daddr + dsiz); } return addr; } - addr = (vm_offset_t)vms->vm_daddr + MAXDSIZ; - addr += karc4random() & (MIN((256 * 1024 * 1024), MAXDSIZ) - 1); + /* + * randomize_mmap && addr == 0. 
For now randomize the + * address within a dsiz range beyond the data limit. + */ + addr = (vm_offset_t)vms->vm_daddr + dsiz; + if (dsiz) + addr += (karc4random64() & 0x7FFFFFFFFFFFFFFFLU) % dsiz; return (round_page(addr)); } diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index 002afd8edb..80553160d3 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -344,9 +344,9 @@ struct vmspace { #define vm_startcopy vm_rssize segsz_t vm_rssize; /* current resident set size in pages */ segsz_t vm_swrss; /* resident set size before last swap */ - segsz_t vm_tsize; /* text size (pages) XXX */ - segsz_t vm_dsize; /* data size (pages) XXX */ - segsz_t vm_ssize; /* stack size (pages) */ + segsz_t vm_tsize; /* text size (bytes) */ + segsz_t vm_dsize; /* data size (bytes) */ + segsz_t vm_ssize; /* stack size (bytes) */ caddr_t vm_taddr; /* user virtual address of text XXX */ caddr_t vm_daddr; /* user virtual address of data XXX */ caddr_t vm_maxsaddr; /* user VA at max stack growth */ diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index 36559ebb89..decd43ece2 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -85,16 +85,6 @@ int vkernel_enable; SYSCTL_INT(_vm, OID_AUTO, vkernel_enable, CTLFLAG_RW, &vkernel_enable, 0, ""); /* - * MPSAFE - */ -int -sys_sbrk(struct sbrk_args *uap) -{ - /* Not yet implemented */ - return (EOPNOTSUPP); -} - -/* * sstk_args(int incr) * * MPSAFE diff --git a/sys/vm/vm_unix.c b/sys/vm/vm_unix.c index 9c2fe51700..5178784c90 100644 --- a/sys/vm/vm_unix.c +++ b/sys/vm/vm_unix.c @@ -57,7 +57,113 @@ #include /* - * sys_obreak backs the C library sbrk call + * sys_sbrk backs the C library sbrk call + * + * void *sbrk(intptr_t incr) + * + * No requirements. 
+ */ +int +sys_sbrk(struct sbrk_args *uap) +{ + struct proc *p = curproc; + struct vmspace *vm = p->p_vmspace; + vm_offset_t nbase; + vm_offset_t base; + vm_offset_t base_end; + vm_offset_t incr; + int rv; + int error; + + error = 0; + + lwkt_gettoken(&vm->vm_map.token); + + /* + * Cannot assume that last data page for binary is mapped R/W. + */ + base = round_page((vm_offset_t)vm->vm_daddr) + vm->vm_dsize; + incr = uap->incr; + + /* + * Cannot allow space to be freed with sbrk() because it is not + * thread-safe for userland and because unrelated mmap()s can reside + * in the data space if the DATA rlimit is raised on the running + * program. + */ + if (incr < 0) { + error = EOPNOTSUPP; + goto done; + } + + /* + * Userland requests current base + */ + if (incr == 0) { + uap->sysmsg_resultp = (void *)base; + goto done; + } + + /* + * Calculate approximate area (vm_map_find() may change this). + * Check for overflow, address space, and rlimit caps. + */ + base_end = base + incr; + if (base_end >= VM_MAX_USER_ADDRESS) { + error = ENOMEM; + goto done; + } + if (base_end < base || + base_end - (vm_offset_t)vm->vm_daddr > + (vm_offset_t)p->p_rlimit[RLIMIT_DATA].rlim_cur) { + error = ENOMEM; + goto done; + } + + /* + * Same-page optimization (protected by token) + */ + if ((base & PAGE_MASK) != 0 && + ((base ^ (base_end - 1)) & ~(vm_offset_t)PAGE_MASK) == 0) { + uap->sysmsg_resultp = (void *)base; + vm->vm_dsize += incr; + goto done; + } + + /* + * Formally map more space + */ + nbase = round_page(base); + rv = vm_map_find(&vm->vm_map, NULL, NULL, + 0, &nbase, round_page(incr), + PAGE_SIZE, FALSE, + VM_MAPTYPE_NORMAL, VM_SUBSYS_BRK, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (rv != KERN_SUCCESS) { + error = ENOMEM; + goto done; + } + base_end = nbase + round_page(incr); + uap->sysmsg_resultp = (void *)nbase; + if (vm->vm_map.flags & MAP_WIREFUTURE) + vm_map_wire(&vm->vm_map, base, base_end, FALSE); + + /* + * Adjust dsize upwards only + */ + incr = nbase + incr - 
round_page((vm_offset_t)vm->vm_daddr); + if (vm->vm_dsize < incr) + vm->vm_dsize = incr; + +done: + lwkt_reltoken(&vm->vm_map.token); + + return (error); +} + +/* + * sys_obreak is used by the sbrk emulation code in libc when sbrk() + * is not supported. * * obreak_args(char *nsize) * @@ -78,7 +184,7 @@ sys_obreak(struct obreak_args *uap) base = round_page((vm_offset_t)vm->vm_daddr); new = round_page((vm_offset_t)uap->nsize); - old = base + ctob(vm->vm_dsize); + old = base + round_page(vm->vm_dsize); if (new > base) { /* @@ -124,14 +230,17 @@ sys_obreak(struct obreak_args *uap) if (vm->vm_map.flags & MAP_WIREFUTURE) vm_map_wire(&vm->vm_map, old, new, FALSE); - vm->vm_dsize += btoc(diff); + vm->vm_dsize += diff; } else if (new < old) { + error = EOPNOTSUPP; +#if 0 rv = vm_map_remove(&vm->vm_map, new, old); if (rv != KERN_SUCCESS) { error = ENOMEM; goto done; } - vm->vm_dsize -= btoc(old - new); + vm->vm_dsize -= old - new; +#endif } done: lwkt_reltoken(&vm->vm_map.token); -- 2.11.4.GIT