/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */
/*
 * Mapped file (mmap) interface to VM
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");

#ifdef MAP_32BIT
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif
#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
        int incr;
};
#endif

int
sys_sbrk(struct thread *td, struct sbrk_args *uap)
{
        /* Not yet implemented */
        return (EOPNOTSUPP);
}
#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
        int incr;
};
#endif

int
sys_sstk(struct thread *td, struct sstk_args *uap)
{
        /* Not yet implemented */
        return (EOPNOTSUPP);
}
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
        int dummy;
};
#endif

int
ogetpagesize(struct thread *td, struct getpagesize_args *uap)
{

        td->td_retval[0] = PAGE_SIZE;
        return (0);
}
#endif                          /* COMPAT_43 */
/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
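/*
 * Illustrative userland sketch (not taken from this file): mapping a file
 * at a non-page-aligned offset, assuming "fd" is an open descriptor.  Per
 * the comment above, the mapping really starts at trunc_page(addr) and the
 * returned pointer is bumped by the page offset:
 *
 *	char *p = mmap(NULL, 100, PROT_READ, MAP_SHARED, fd,
 *	    (off_t)4096 + 10);
 *	if (p == MAP_FAILED)
 *		err(1, "mmap");
 *	... *p is the byte at file offset 4106 ...
 */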
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
        void *addr;
        size_t len;
        int prot;
        int flags;
        int fd;
        long pad;
        off_t pos;
};
#endif

int
sys_mmap(struct thread *td, struct mmap_args *uap)
{
        struct file *fp;
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t cap_maxprot;
        int align, error, flags, prot;
        off_t pos;
        struct vmspace *vms = td->td_proc->p_vmspace;
        cap_rights_t rights;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot;
        flags = uap->flags;
        pos = uap->pos;

        fp = NULL;
        AUDIT_ARG_FD(uap->fd);

        /*
         * Ignore old flags that used to be defined but did not do anything.
         */
        flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);

        /*
         * Enforce the constraints.
         * Mapping of length 0 is only allowed for old binaries.
         * Anonymous mapping shall specify -1 as filedescriptor and
         * zero position for new code. Be nice to ancient a.out
         * binaries and correct pos for anonymous mapping, since old
         * ld.so sometimes issues anonymous map requests with non-zero
         * pos.
         */
        if (!SV_CURPROC_FLAG(SV_AOUT)) {
                if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
                    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
                        return (EINVAL);
        } else {
                if ((flags & MAP_ANON) != 0)
                        pos = 0;
        }

        if (flags & MAP_STACK) {
                if ((uap->fd != -1) ||
                    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
                        return (EINVAL);
                flags |= MAP_ANON;
                pos = 0;
        }
        if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
            MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
            MAP_PREFAULT_READ |
#ifdef MAP_32BIT
            MAP_32BIT |
#endif
            MAP_ALIGNMENT_MASK)) != 0)
                return (EINVAL);
        if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
                return (EINVAL);
        if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
                return (EINVAL);
        if (prot != PROT_NONE &&
            (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
                return (EINVAL);

        /*
         * Align the file position to a page boundary,
         * and save its page offset component.
         */
        pageoff = (pos & PAGE_MASK);
        pos -= pageoff;

        /* Adjust size for rounding (on both ends). */
        size += pageoff;                        /* low end... */
        size = (vm_size_t) round_page(size);    /* hi end */

        /* Ensure alignment is at least a page and fits in a pointer. */
        align = flags & MAP_ALIGNMENT_MASK;
        if (align != 0 && align != MAP_ALIGNED_SUPER &&
            (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
            align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
                return (EINVAL);

        /*
         * Check for illegal addresses.  Watch out for address wrap... Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        if (flags & MAP_FIXED) {
                /*
                 * The specified address must have the same remainder
                 * as the file offset taken modulo PAGE_SIZE, so it
                 * should be aligned after adjustment by pageoff.
                 */
                addr -= pageoff;
                if (addr & PAGE_MASK)
                        return (EINVAL);

                /* Address range must be all in user VM space. */
                if (addr < vm_map_min(&vms->vm_map) ||
                    addr + size > vm_map_max(&vms->vm_map))
                        return (EINVAL);
                if (addr + size < addr)
                        return (EINVAL);
#ifdef MAP_32BIT
                if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
                        return (EINVAL);
        } else if (flags & MAP_32BIT) {
                /*
                 * For MAP_32BIT, override the hint if it is too high and
                 * do not bother moving the mapping past the heap (since
                 * the heap is usually above 2GB).
                 */
                if (addr + size > MAP_32BIT_MAX_ADDR)
                        addr = 0;
#endif
        } else {
                /*
                 * XXX for non-fixed mappings where no hint is provided or
                 * the hint would fall in the potential heap space,
                 * place it after the end of the largest possible heap.
                 *
                 * There should really be a pmap call to determine a reasonable
                 * location.
                 */
                if (addr == 0 ||
                    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
                    addr < round_page((vm_offset_t)vms->vm_daddr +
                    lim_max(td, RLIMIT_DATA))))
                        addr = round_page((vm_offset_t)vms->vm_daddr +
                            lim_max(td, RLIMIT_DATA));
        }
        if (size == 0) {
                /*
                 * Return success without mapping anything for old
                 * binaries that request a page-aligned mapping of
                 * length 0.  For modern binaries, this function
                 * returns an error earlier.
                 */
                error = 0;
        } else if (flags & MAP_ANON) {
                /*
                 * Mapping blank space is trivial.
                 *
                 * This relies on VM_PROT_* matching PROT_*.
                 */
                error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
                    VM_PROT_ALL, flags, NULL, pos, FALSE, td);
        } else {
                /*
                 * Mapping file, get fp for validation and don't let the
                 * descriptor disappear on us if we block. Check capability
                 * rights, but also return the maximum rights to be combined
                 * with maxprot later.
                 */
                cap_rights_init(&rights, CAP_MMAP);
                if (prot & PROT_READ)
                        cap_rights_set(&rights, CAP_MMAP_R);
                if ((flags & MAP_SHARED) != 0) {
                        if (prot & PROT_WRITE)
                                cap_rights_set(&rights, CAP_MMAP_W);
                }
                if (prot & PROT_EXEC)
                        cap_rights_set(&rights, CAP_MMAP_X);
                error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
                if (error != 0)
                        goto done;
                if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
                    td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) {
                        error = EINVAL;
                        goto done;
                }

                /* This relies on VM_PROT_* matching PROT_*. */
                error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
                    cap_maxprot, flags, pos, td);
        }

        if (error == 0)
                td->td_retval[0] = (register_t) (addr + pageoff);
done:
        if (fp)
                fdrop(fp, td);

        return (error);
}
#if defined(COMPAT_FREEBSD6)
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
        struct mmap_args oargs;

        oargs.addr = uap->addr;
        oargs.len = uap->len;
        oargs.prot = uap->prot;
        oargs.flags = uap->flags;
        oargs.fd = uap->fd;
        oargs.pos = uap->pos;
        return (sys_mmap(td, &oargs));
}
#endif
#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
        caddr_t addr;
        int len;
        int prot;
        int flags;
        int fd;
        long pos;
};
#endif
int
ommap(struct thread *td, struct ommap_args *uap)
{
        struct mmap_args nargs;
        static const char cvtbsdprot[8] = {
                0,
                PROT_EXEC,
                PROT_WRITE,
                PROT_EXEC | PROT_WRITE,
                PROT_READ,
                PROT_EXEC | PROT_READ,
                PROT_WRITE | PROT_READ,
                PROT_EXEC | PROT_WRITE | PROT_READ,
        };

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

        nargs.addr = uap->addr;
        nargs.len = uap->len;
        nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__)
        if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
            nargs.prot != 0)
                nargs.prot |= PROT_EXEC;
#endif
#endif
        nargs.flags = 0;
        if (uap->flags & OMAP_ANON)
                nargs.flags |= MAP_ANON;
        if (uap->flags & OMAP_COPY)
                nargs.flags |= MAP_COPY;
        if (uap->flags & OMAP_SHARED)
                nargs.flags |= MAP_SHARED;
        else
                nargs.flags |= MAP_PRIVATE;
        if (uap->flags & OMAP_FIXED)
                nargs.flags |= MAP_FIXED;
        nargs.fd = uap->fd;
        nargs.pos = uap->pos;
        return (sys_mmap(td, &nargs));
}
#endif                          /* COMPAT_43 */
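/*
 * A note on cvtbsdprot[] above: the old 4.3BSD mmap() encoded protection
 * in three bits, exec = 1, write = 2, read = 4, and the table simply
 * indexes on that value.  For example, an old prot of 5 (exec | read)
 * converts to PROT_EXEC | PROT_READ.
 */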
#ifndef _SYS_SYSPROTO_H_
struct msync_args {
        void *addr;
        size_t len;
        int flags;
};
#endif
int
sys_msync(struct thread *td, struct msync_args *uap)
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        int flags;
        vm_map_t map;
        int rv;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        flags = uap->flags;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
                return (EINVAL);

        map = &td->td_proc->p_vmspace->vm_map;

        /*
         * Clean the pages and interpret the return value.
         */
        rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
            (flags & MS_INVALIDATE) != 0);
        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
                return (ENOMEM);
        case KERN_INVALID_ARGUMENT:
                return (EBUSY);
        case KERN_FAILURE:
                return (EIO);
        default:
                return (EINVAL);
        }
}
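/*
 * Illustrative userland sketch (not taken from this file): flushing a
 * dirty shared file mapping, assuming "p" and "len" describe a region
 * mapped with MAP_SHARED.  Note the check above rejects MS_ASYNC
 * combined with MS_INVALIDATE:
 *
 *	if (msync(p, len, MS_SYNC) == -1)
 *		err(1, "msync");
 */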
#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
        void *addr;
        size_t len;
};
#endif
int
sys_munmap(struct thread *td, struct munmap_args *uap)
{
#ifdef HWPMC_HOOKS
        struct pmckern_map_out pkm;
        vm_map_entry_t entry;
#endif
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_map_t map;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        if (size == 0)
                return (EINVAL);

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        /*
         * Check for illegal addresses.  Watch out for address wrap...
         */
        map = &td->td_proc->p_vmspace->vm_map;
        if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
                return (EINVAL);
        vm_map_lock(map);
#ifdef HWPMC_HOOKS
        /*
         * Inform hwpmc if the address range being unmapped contains
         * an executable region.
         */
        pkm.pm_address = (uintptr_t) NULL;
        if (vm_map_lookup_entry(map, addr, &entry)) {
                for (;
                    entry != &map->header && entry->start < addr + size;
                    entry = entry->next) {
                        if (vm_map_check_protection(map, entry->start,
                                entry->end, VM_PROT_EXECUTE) == TRUE) {
                                pkm.pm_address = (uintptr_t) addr;
                                pkm.pm_size = (size_t) size;
                                break;
                        }
                }
        }
#endif
        vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
        /* downgrade the lock to prevent a LOR with the pmc-sx lock */
        vm_map_lock_downgrade(map);
        if (pkm.pm_address != (uintptr_t) NULL)
                PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
        vm_map_unlock_read(map);
#else
        vm_map_unlock(map);
#endif
        /* vm_map_delete returns nothing but KERN_SUCCESS anyway */
        return (0);
}
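/*
 * Illustrative userland sketch (not taken from this file): unmapping a
 * region obtained from mmap(); the kernel rounds the range to page
 * boundaries exactly as the code above does:
 *
 *	if (munmap(p, len) == -1)
 *		err(1, "munmap");
 */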
#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
        const void *addr;
        size_t len;
        int prot;
};
#endif
int
sys_mprotect(struct thread *td, struct mprotect_args *uap)
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t prot;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot & VM_PROT_ALL;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, prot, FALSE)) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        case KERN_RESOURCE_SHORTAGE:
                return (ENOMEM);
        }
        return (EINVAL);
}
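/*
 * Illustrative userland sketch (not taken from this file): a W^X-style
 * transition, writing generated code into an anonymous region and then
 * trading the write permission for execute:
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	memcpy(p, code, codelen);
 *	if (mprotect(p, len, PROT_READ | PROT_EXEC) == -1)
 *		err(1, "mprotect");
 */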
#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
        void *addr;
        size_t len;
        int inherit;
};
#endif
int
sys_minherit(struct thread *td, struct minherit_args *uap)
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_inherit_t inherit;

        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        inherit = uap->inherit;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, inherit)) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        }
        return (EINVAL);
}
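/*
 * Illustrative userland sketch (not taken from this file): controlling
 * what a child created by fork() sees of a region:
 *
 *	minherit(p, len, INHERIT_SHARE);	(child shares the pages)
 *	minherit(p, len, INHERIT_NONE);		(region unmapped in child)
 */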
#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
        void *addr;
        size_t len;
        int behav;
};
#endif

int
sys_madvise(struct thread *td, struct madvise_args *uap)
{
        vm_offset_t start, end;
        vm_map_t map;
        int flags;

        /*
         * Check for our special case, advising the swap pager we are
         * "immortal."
         */
        if (uap->behav == MADV_PROTECT) {
                flags = PPROT_SET;
                return (kern_procctl(td, P_PID, td->td_proc->p_pid,
                    PROC_SPROTECT, &flags));
        }

        /*
         * Check for illegal behavior
         */
        if (uap->behav < 0 || uap->behav > MADV_CORE)
                return (EINVAL);
        /*
         * Check for illegal addresses.  Watch out for address wrap... Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        map = &td->td_proc->p_vmspace->vm_map;
        if ((vm_offset_t)uap->addr < vm_map_min(map) ||
            (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
                return (EINVAL);
        if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
                return (EINVAL);

        /*
         * Since this routine is only advisory, we default to conservative
         * behavior.
         */
        start = trunc_page((vm_offset_t) uap->addr);
        end = round_page((vm_offset_t) uap->addr + uap->len);

        if (vm_map_madvise(map, start, end, uap->behav))
                return (EINVAL);
        return (0);
}
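/*
 * Illustrative userland sketch (not taken from this file): advising the
 * VM system around a one-pass scan of a large mapping:
 *
 *	madvise(p, len, MADV_SEQUENTIAL);	(read ahead aggressively)
 *	... scan the region ...
 *	madvise(p, len, MADV_DONTNEED);		(pages may be reclaimed)
 */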
#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
        const void *addr;
        size_t len;
        char *vec;
};
#endif

int
sys_mincore(struct thread *td, struct mincore_args *uap)
{
        vm_offset_t addr, first_addr;
        vm_offset_t end, cend;
        pmap_t pmap;
        vm_map_t map;
        char *vec;
        int error = 0;
        int vecindex, lastvecindex;
        vm_map_entry_t current;
        vm_map_entry_t entry;
        vm_object_t object;
        vm_paddr_t locked_pa;
        vm_page_t m;
        vm_pindex_t pindex;
        int mincoreinfo;
        unsigned int timestamp;
        boolean_t locked;

        /*
         * Make sure that the addresses presented are valid for user
         * mode.
         */
        first_addr = addr = trunc_page((vm_offset_t) uap->addr);
        end = addr + (vm_size_t)round_page(uap->len);
        map = &td->td_proc->p_vmspace->vm_map;
        if (end > vm_map_max(map) || end < addr)
                return (ENOMEM);

        /*
         * Address of byte vector
         */
        vec = uap->vec;

        pmap = vmspace_pmap(td->td_proc->p_vmspace);

        vm_map_lock_read(map);
RestartScan:
        timestamp = map->timestamp;

        if (!vm_map_lookup_entry(map, addr, &entry)) {
                vm_map_unlock_read(map);
                return (ENOMEM);
        }

        /*
         * Do this on a map entry basis so that if the pages are not
         * in the current processes address space, we can easily look
         * up the pages elsewhere.
         */
        lastvecindex = -1;
        for (current = entry;
            (current != &map->header) && (current->start < end);
            current = current->next) {

                /*
                 * check for contiguity
                 */
                if (current->end < end &&
                    (entry->next == &map->header ||
                    current->next->start > current->end)) {
                        vm_map_unlock_read(map);
                        return (ENOMEM);
                }

                /*
                 * ignore submaps (for now) or null objects
                 */
                if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
                    current->object.vm_object == NULL)
                        continue;

                /*
                 * limit this scan to the current map entry and the
                 * limits for the mincore call
                 */
                if (addr < current->start)
                        addr = current->start;
                cend = current->end;
                if (cend > end)
                        cend = end;

                /*
                 * scan this entry one page at a time
                 */
                while (addr < cend) {
                        /*
                         * Check pmap first, it is likely faster, also
                         * it can provide info as to whether we are the
                         * one referencing or modifying the page.
                         */
                        object = NULL;
                        locked_pa = 0;
                retry:
                        m = NULL;
                        mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
                        if (locked_pa != 0) {
                                /*
                                 * The page is mapped by this process but not
                                 * both accessed and modified.  It is also
                                 * managed.  Acquire the object lock so that
                                 * other mappings might be examined.
                                 */
                                m = PHYS_TO_VM_PAGE(locked_pa);
                                if (m->object != object) {
                                        if (object != NULL)
                                                VM_OBJECT_WUNLOCK(object);
                                        object = m->object;
                                        locked = VM_OBJECT_TRYWLOCK(object);
                                        vm_page_unlock(m);
                                        if (!locked) {
                                                VM_OBJECT_WLOCK(object);
                                                vm_page_lock(m);
                                                goto retry;
                                        }
                                } else
                                        vm_page_unlock(m);
                                KASSERT(m->valid == VM_PAGE_BITS_ALL,
                                    ("mincore: page %p is mapped but invalid",
                                    m));
                        } else if (mincoreinfo == 0) {
                                /*
                                 * The page is not mapped by this process.  If
                                 * the object implements managed pages, then
                                 * determine if the page is resident so that
                                 * the mappings might be examined.
                                 */
                                if (current->object.vm_object != object) {
                                        if (object != NULL)
                                                VM_OBJECT_WUNLOCK(object);
                                        object = current->object.vm_object;
                                        VM_OBJECT_WLOCK(object);
                                }
                                if (object->type == OBJT_DEFAULT ||
                                    object->type == OBJT_SWAP ||
                                    object->type == OBJT_VNODE) {
                                        pindex = OFF_TO_IDX(current->offset +
                                            (addr - current->start));
                                        m = vm_page_lookup(object, pindex);
                                        if (m == NULL &&
                                            vm_page_is_cached(object, pindex))
                                                mincoreinfo = MINCORE_INCORE;
                                        if (m != NULL && m->valid == 0)
                                                m = NULL;
                                        if (m != NULL)
                                                mincoreinfo = MINCORE_INCORE;
                                }
                        }
                        if (m != NULL) {
                                /* Examine other mappings to the page. */
                                if (m->dirty == 0 && pmap_is_modified(m))
                                        vm_page_dirty(m);
                                if (m->dirty != 0)
                                        mincoreinfo |= MINCORE_MODIFIED_OTHER;
                                /*
                                 * The first test for PGA_REFERENCED is an
                                 * optimization.  The second test is
                                 * required because a concurrent pmap
                                 * operation could clear the last reference
                                 * and set PGA_REFERENCED before the call to
                                 * pmap_is_referenced().
                                 */
                                if ((m->aflags & PGA_REFERENCED) != 0 ||
                                    pmap_is_referenced(m) ||
                                    (m->aflags & PGA_REFERENCED) != 0)
                                        mincoreinfo |= MINCORE_REFERENCED_OTHER;
                        }
                        if (object != NULL)
                                VM_OBJECT_WUNLOCK(object);

                        /*
                         * subyte may page fault.  In case it needs to modify
                         * the map, we release the lock.
                         */
                        vm_map_unlock_read(map);

                        /*
                         * calculate index into user supplied byte vector
                         */
                        vecindex = OFF_TO_IDX(addr - first_addr);

                        /*
                         * If we have skipped map entries, we need to make sure that
                         * the byte vector is zeroed for those skipped entries.
                         */
                        while ((lastvecindex + 1) < vecindex) {
                                ++lastvecindex;
                                error = subyte(vec + lastvecindex, 0);
                                if (error) {
                                        error = EFAULT;
                                        goto done2;
                                }
                        }

                        /*
                         * Pass the page information to the user
                         */
                        error = subyte(vec + vecindex, mincoreinfo);
                        if (error) {
                                error = EFAULT;
                                goto done2;
                        }

                        /*
                         * If the map has changed, due to the subyte, the previous
                         * output may be invalid.
                         */
                        vm_map_lock_read(map);
                        if (timestamp != map->timestamp)
                                goto RestartScan;

                        lastvecindex = vecindex;
                        addr += PAGE_SIZE;
                }
        }

        /*
         * subyte may page fault.  In case it needs to modify
         * the map, we release the lock.
         */
        vm_map_unlock_read(map);

        /*
         * Zero the last entries in the byte vector.
         */
        vecindex = OFF_TO_IDX(end - first_addr);
        while ((lastvecindex + 1) < vecindex) {
                ++lastvecindex;
                error = subyte(vec + lastvecindex, 0);
                if (error) {
                        error = EFAULT;
                        goto done2;
                }
        }

        /*
         * If the map has changed, due to the subyte, the previous
         * output may be invalid.
         */
        vm_map_lock_read(map);
        if (timestamp != map->timestamp)
                goto RestartScan;
        vm_map_unlock_read(map);
done2:
        return (error);
}
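/*
 * Illustrative userland sketch (not taken from this file): counting the
 * resident pages of a mapping via the byte vector filled in above,
 * assuming "npages" is the page count of the region:
 *
 *	char *vec = malloc(npages);
 *	size_t i, resident = 0;
 *	if (mincore(p, len, vec) == 0)
 *		for (i = 0; i < npages; i++)
 *			if (vec[i] & MINCORE_INCORE)
 *				resident++;
 */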
#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
        const void *addr;
        size_t len;
};
#endif
int
sys_mlock(struct thread *td, struct mlock_args *uap)
{

        return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
}
int
vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
{
        vm_offset_t addr, end, last, start;
        vm_size_t npages, size;
        vm_map_t map;
        unsigned long nsize;
        int error;

        error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
        if (error)
                return (error);
        addr = (vm_offset_t)addr0;
        size = len;
        last = addr + size;
        start = trunc_page(addr);
        end = round_page(last);
        if (last < addr || end < addr)
                return (EINVAL);
        npages = atop(end - start);
        if (npages > vm_page_max_wired)
                return (ENOMEM);
        map = &proc->p_vmspace->vm_map;
        PROC_LOCK(proc);
        nsize = ptoa(npages + pmap_wired_count(map->pmap));
        if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
                PROC_UNLOCK(proc);
                return (ENOMEM);
        }
        PROC_UNLOCK(proc);
        if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
                return (EAGAIN);
#ifdef RACCT
        if (racct_enable) {
                PROC_LOCK(proc);
                error = racct_set(proc, RACCT_MEMLOCK, nsize);
                PROC_UNLOCK(proc);
                if (error != 0)
                        return (ENOMEM);
        }
#endif
        error = vm_map_wire(map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
        if (racct_enable && error != KERN_SUCCESS) {
                PROC_LOCK(proc);
                racct_set(proc, RACCT_MEMLOCK,
                    ptoa(pmap_wired_count(map->pmap)));
                PROC_UNLOCK(proc);
        }
#endif
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
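/*
 * Illustrative userland sketch (not taken from this file): wiring a buffer
 * that must never reach swap, e.g. one holding key material.  Exceeding a
 * limit checked above reports ENOMEM:
 *
 *	if (mlock(key, keylen) == -1)
 *		err(1, "mlock");
 */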
#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
        int how;
};
#endif

int
sys_mlockall(struct thread *td, struct mlockall_args *uap)
{
        vm_map_t map;
        int error;

        map = &td->td_proc->p_vmspace->vm_map;
        error = priv_check(td, PRIV_VM_MLOCK);
        if (error)
                return (error);

        if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
                return (EINVAL);

        /*
         * If wiring all pages in the process would cause it to exceed
         * a hard resource limit, return ENOMEM.
         */
        if (!old_mlock && uap->how & MCL_CURRENT) {
                PROC_LOCK(td->td_proc);
                if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) {
                        PROC_UNLOCK(td->td_proc);
                        return (ENOMEM);
                }
                PROC_UNLOCK(td->td_proc);
        }
#ifdef RACCT
        if (racct_enable) {
                PROC_LOCK(td->td_proc);
                error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
                PROC_UNLOCK(td->td_proc);
                if (error != 0)
                        return (ENOMEM);
        }
#endif

        if (uap->how & MCL_FUTURE) {
                vm_map_lock(map);
                vm_map_modflags(map, MAP_WIREFUTURE, 0);
                vm_map_unlock(map);
                error = 0;
        }

        if (uap->how & MCL_CURRENT) {
                /*
                 * P1003.1-2001 mandates that all currently mapped pages
                 * will be memory resident and locked (wired) upon return
                 * from mlockall(). vm_map_wire() will wire pages, by
                 * calling vm_fault_wire() for each page in the region.
                 */
                error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
                    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
                error = (error == KERN_SUCCESS ? 0 : EAGAIN);
        }
#ifdef RACCT
        if (racct_enable && error != KERN_SUCCESS) {
                PROC_LOCK(td->td_proc);
                racct_set(td->td_proc, RACCT_MEMLOCK,
                    ptoa(pmap_wired_count(map->pmap)));
                PROC_UNLOCK(td->td_proc);
        }
#endif

        return (error);
}
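/*
 * Illustrative userland sketch (not taken from this file): a real-time
 * process wiring its entire address space, present and future, which sets
 * MAP_WIREFUTURE on the map as above:
 *
 *	if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
 *		err(1, "mlockall");
 */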
#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
        register_t dummy;
};
#endif

int
sys_munlockall(struct thread *td, struct munlockall_args *uap)
{
        vm_map_t map;
        int error;

        map = &td->td_proc->p_vmspace->vm_map;
        error = priv_check(td, PRIV_VM_MUNLOCK);
        if (error)
                return (error);

        /* Clear the MAP_WIREFUTURE flag from this vm_map. */
        vm_map_lock(map);
        vm_map_modflags(map, 0, MAP_WIREFUTURE);
        vm_map_unlock(map);

        /* Forcibly unwire all pages. */
        error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
            VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
        if (racct_enable && error == KERN_SUCCESS) {
                PROC_LOCK(td->td_proc);
                racct_set(td->td_proc, RACCT_MEMLOCK, 0);
                PROC_UNLOCK(td->td_proc);
        }
#endif

        return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
        const void *addr;
        size_t len;
};
#endif
int
sys_munlock(struct thread *td, struct munlock_args *uap)
{
        vm_offset_t addr, end, last, start;
        vm_size_t size;
#ifdef RACCT
        vm_map_t map;
#endif
        int error;

        error = priv_check(td, PRIV_VM_MUNLOCK);
        if (error)
                return (error);
        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        last = addr + size;
        start = trunc_page(addr);
        end = round_page(last);
        if (last < addr || end < addr)
                return (EINVAL);
        error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
        if (racct_enable && error == KERN_SUCCESS) {
                PROC_LOCK(td->td_proc);
                map = &td->td_proc->p_vmspace->vm_map;
                racct_set(td->td_proc, RACCT_MEMLOCK,
                    ptoa(pmap_wired_count(map->pmap)));
                PROC_UNLOCK(td->td_proc);
        }
#endif
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on vnodes.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
        struct vattr va;
        vm_object_t obj;
        vm_offset_t foff;
        struct ucred *cred;
        int error, flags, locktype;

        cred = td->td_ucred;
        if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
                locktype = LK_EXCLUSIVE;
        else
                locktype = LK_SHARED;
        if ((error = vget(vp, locktype, td)) != 0)
                return (error);
        AUDIT_ARG_VNODE1(vp);
        foff = *foffp;
        flags = *flagsp;
        obj = vp->v_object;
        if (vp->v_type == VREG) {
                /*
                 * Get the proper underlying object
                 */
                if (obj == NULL) {
                        error = EINVAL;
                        goto done;
                }
                if (obj->type == OBJT_VNODE && obj->handle != vp) {
                        vput(vp);
                        vp = (struct vnode *)obj->handle;
                        /*
                         * Bypass filesystems obey the mpsafety of the
                         * underlying fs.  Tmpfs never bypasses.
                         */
                        error = vget(vp, locktype, td);
                        if (error != 0)
                                return (error);
                }
                if (locktype == LK_EXCLUSIVE) {
                        *writecounted = TRUE;
                        vnode_pager_update_writecount(obj, 0, objsize);
                }
        } else {
                error = EINVAL;
                goto done;
        }
        if ((error = VOP_GETATTR(vp, &va, cred)))
                goto done;
#ifdef MAC
        /* This relies on VM_PROT_* matching PROT_*. */
        error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
        if (error != 0)
                goto done;
#endif
        if ((flags & MAP_SHARED) != 0) {
                if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
                        if (prot & VM_PROT_WRITE) {
                                error = EPERM;
                                goto done;
                        }
                        *maxprotp &= ~VM_PROT_WRITE;
                }
        }
        /*
         * If it is a regular file without any references
         * we do not need to sync it.
         * Adjust object size to be the size of actual file.
         */
        objsize = round_page(va.va_size);
        if (va.va_nlink == 0)
                flags |= MAP_NOSYNC;
        if (obj->type == OBJT_VNODE) {
                obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
                    cred);
                if (obj == NULL) {
                        error = ENOMEM;
                        goto done;
                }
        } else {
                KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
                    ("wrong object type"));
                VM_OBJECT_WLOCK(obj);
                vm_object_reference_locked(obj);
#if VM_NRESERVLEVEL > 0
                vm_object_color(obj, 0);
#endif
                VM_OBJECT_WUNLOCK(obj);
        }
        *objp = obj;
        *flagsp = flags;

        vfs_mark_atime(vp, cred);

done:
        if (error != 0 && *writecounted) {
                *writecounted = FALSE;
                vnode_pager_update_writecount(obj, objsize, 0);
        }
        vput(vp);
        return (error);
}
/*
 * vm_mmap_cdev()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
    vm_ooffset_t *foff, vm_object_t *objp)
{
        vm_object_t obj;
        int error, flags;

        flags = *flagsp;

        if (dsw->d_flags & D_MMAP_ANON) {
                *objp = NULL;
                *foff = 0;
                *maxprotp = VM_PROT_ALL;
                *flagsp |= MAP_ANON;
                return (0);
        }
        /*
         * cdevs do not provide private mappings of any kind.
         */
        if ((*maxprotp & VM_PROT_WRITE) == 0 &&
            (prot & VM_PROT_WRITE) != 0)
                return (EACCES);
        if (flags & (MAP_PRIVATE|MAP_COPY))
                return (EINVAL);
        /*
         * Force device mappings to be shared.
         */
        flags |= MAP_SHARED;
#ifdef MAC_XXX
        error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
        if (error != 0)
                return (error);
#endif
        /*
         * First, try d_mmap_single().  If that is not implemented
         * (returns ENODEV), fall back to using the device pager.
         * Note that d_mmap_single() must return a reference to the
         * object (it needs to bump the reference count of the object
         * it returns somehow).
         *
         * XXX assumes VM_PROT_* == PROT_*
         */
        error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
        if (error != ENODEV)
                return (error);
        obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
            td->td_ucred);
        if (obj == NULL)
                return (EINVAL);
        *objp = obj;
        *flagsp = flags;
        return (0);
}
/*
 * vm_mmap()
 *
 * Internal version of mmap used by exec, sys5 shared memory, and
 * various device drivers.  Handle is either a vnode pointer, a
 * character device, or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags,
    objtype_t handle_type, void *handle,
    vm_ooffset_t foff)
{
        vm_object_t object;
        struct thread *td = curthread;
        int error;
        boolean_t writecounted;

        if (size == 0)
                return (EINVAL);

        size = round_page(size);
        object = NULL;
        writecounted = FALSE;

        /*
         * Lookup/allocate object.
         */
        switch (handle_type) {
        case OBJT_DEVICE: {
                struct cdevsw *dsw;
                struct cdev *cdev;
                int ref;

                cdev = handle;
                dsw = dev_refthread(cdev, &ref);
                if (dsw == NULL)
                        return (ENXIO);
                error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
                    dsw, &foff, &object);
                dev_relthread(cdev, ref);
                break;
        }
        case OBJT_VNODE:
                error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
                    handle, &foff, &object, &writecounted);
                break;
        case OBJT_DEFAULT:
                if (handle == NULL) {
                        error = 0;
                        break;
                }
                /* FALLTHROUGH */
        default:
                error = EINVAL;
                break;
        }
        if (error)
                return (error);

        error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
            foff, writecounted, td);
        if (error != 0 && object != NULL) {
                /*
                 * If this mapping was accounted for in the vnode's
                 * writecount, then undo that now.
                 */
                if (writecounted)
                        vnode_pager_release_writecount(object, 0, size);
                vm_object_deallocate(object);
        }
        return (error);
}
/*
 * Internal version of mmap that maps a specific VM object into an
 * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
 */
int
vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
    boolean_t writecounted, struct thread *td)
{
        boolean_t fitit;
        int docow, error, findspace, rv;

        if (map == &td->td_proc->p_vmspace->vm_map) {
                PROC_LOCK(td->td_proc);
                if (map->size + size > lim_cur_proc(td->td_proc, RLIMIT_VMEM)) {
                        PROC_UNLOCK(td->td_proc);
                        return (ENOMEM);
                }
                if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
                        PROC_UNLOCK(td->td_proc);
                        return (ENOMEM);
                }
                if (!old_mlock && map->flags & MAP_WIREFUTURE) {
                        if (ptoa(pmap_wired_count(map->pmap)) + size >
                            lim_cur_proc(td->td_proc, RLIMIT_MEMLOCK)) {
                                racct_set_force(td->td_proc, RACCT_VMEM,
                                    map->size);
                                PROC_UNLOCK(td->td_proc);
                                return (ENOMEM);
                        }
                        error = racct_set(td->td_proc, RACCT_MEMLOCK,
                            ptoa(pmap_wired_count(map->pmap)) + size);
                        if (error != 0) {
                                racct_set_force(td->td_proc, RACCT_VMEM,
                                    map->size);
                                PROC_UNLOCK(td->td_proc);
                                return (error);
                        }
                }
                PROC_UNLOCK(td->td_proc);
        }

        /*
         * We currently can only deal with page aligned file offsets.
         * The mmap() system call already enforces this by subtracting
         * the page offset from the file offset, but checking here
         * catches errors in device drivers (e.g. d_single_mmap()
         * callbacks) and other internal mapping requests (such as in
         * exec).
         */
        if (foff & PAGE_MASK)
                return (EINVAL);

        if ((flags & MAP_FIXED) == 0) {
                fitit = TRUE;
                *addr = round_page(*addr);
        } else {
                if (*addr != trunc_page(*addr))
                        return (EINVAL);
                fitit = FALSE;
        }

        if (flags & MAP_ANON) {
                if (object != NULL || foff != 0)
                        return (EINVAL);
                docow = 0;
        } else if (flags & MAP_PREFAULT_READ)
                docow = MAP_PREFAULT;
        else
                docow = MAP_PREFAULT_PARTIAL;

        if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
                docow |= MAP_COPY_ON_WRITE;
        if (flags & MAP_NOSYNC)
                docow |= MAP_DISABLE_SYNCER;
        if (flags & MAP_NOCORE)
                docow |= MAP_DISABLE_COREDUMP;
        /* Shared memory is also shared with children. */
        if (flags & MAP_SHARED)
                docow |= MAP_INHERIT_SHARE;
        if (writecounted)
                docow |= MAP_VN_WRITECOUNT;
        if (flags & MAP_STACK) {
                if (object != NULL)
                        return (EINVAL);
                docow |= MAP_STACK_GROWS_DOWN;
        }
        if ((flags & MAP_EXCL) != 0)
                docow |= MAP_CHECK_EXCL;

        if (fitit) {
                if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
                        findspace = VMFS_SUPER_SPACE;
                else if ((flags & MAP_ALIGNMENT_MASK) != 0)
                        findspace = VMFS_ALIGNED_SPACE(flags >>
                            MAP_ALIGNMENT_SHIFT);
                else
                        findspace = VMFS_OPTIMAL_SPACE;
                rv = vm_map_find(map, object, foff, addr, size,
#ifdef MAP_32BIT
                    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
#endif
                    0, findspace, prot, maxprot, docow);
        } else {
                rv = vm_map_fixed(map, object, foff, *addr, size,
                    prot, maxprot, docow);
        }

        if (rv == KERN_SUCCESS) {
                /*
                 * If the process has requested that all future mappings
                 * be wired, then heed this.
                 */
                if (map->flags & MAP_WIREFUTURE) {
                        vm_map_wire(map, *addr, *addr + size,
                            VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
                            VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
                }
        }
        return (vm_mmap_to_errno(rv));
}
/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
        case KERN_NO_SPACE:
                return (ENOMEM);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        default:
                return (EINVAL);
        }
}
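/*
 * Illustrative kernel-side sketch (not taken from this file): callers such
 * as vm_mmap_object() above funnel Mach-style vm_map return codes through
 * this translator instead of mapping them ad hoc:
 *
 *	rv = vm_map_fixed(map, object, foff, *addr, size, prot, maxprot,
 *	    docow);
 *	return (vm_mmap_to_errno(rv));
 */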