/*-
 * Copyright (c) 2006 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Support for shared swap-backed anonymous memory objects via
 * shm_open(2) and shm_unlink(2).  While most of the implementation is
 * here, vm_mmap.c contains mapping logic changes.  (An illustrative
 * userland usage sketch follows this comment block.)
 *
 * (2) Need to export data to a userland tool via a sysctl.  Should ipcs(1)
 *     and ipcrm(1) be expanded or should new tools to manage both POSIX
 *     kernel semaphores and POSIX shared memory be written?
 *
 * (3) Add support for this file type to fstat(1).
 *
 * (4) Resource limits?  Does this need its own resource limits or are the
 *     existing limits in mmap(2) sufficient?
 *
 * (5) Partial page truncation.  vnode_pager_setsize() will zero any parts
 *     of a partially mapped page as a result of ftruncate(2)/truncate(2).
 *     We can do the same (with the same pmap evil), but do we need to
 *     worry about the bits on disk if the page is swapped out or will the
 *     swapper zero the parts of a page that are invalid if the page is
 *     swapped back in for us?
 *
 * (6) Add MAC support in mac_biba(4) and mac_mls(4).
 *
 * (7) Add a MAC check_create() hook for creating new named objects.
 */
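
/*
 * Illustrative userland usage sketch (not kernel code; included only to
 * clarify the interface implemented in this file).  Error handling is
 * omitted and the object name "/example" is arbitrary:
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = shm_open("/example", O_RDWR | O_CREAT, 0600);
 *	ftruncate(fd, 4096);		// sized via shm_truncate() below
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    fd, 0);			// backed by shmfd->shm_object
 *	p[0] = 1;			// visible to other mappers
 *	munmap(p, 4096);
 *	close(fd);			// drops the descriptor reference
 *	shm_unlink("/example");		// drops the name
 */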

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vnode.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>

/* One entry in the path -> shmfd dictionary built in shm_dict_init(). */
struct shm_mapping {
	char		*sm_path;
	Fnv32_t		sm_fnv;
	struct shmfd	*sm_shmfd;
	LIST_ENTRY(shm_mapping)	sm_link;
};

static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
static LIST_HEAD(, shm_mapping) *shm_dictionary;
static struct sx shm_dict_lock;
static struct mtx shm_timestamp_lock;
static u_long shm_hash;

#define	SHM_HASH(fnv)	(&shm_dictionary[(fnv) & shm_hash])

static int	shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags);
static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode);
static void	shm_dict_init(void *arg);
static void	shm_drop(struct shmfd *shmfd);
static struct shmfd *shm_hold(struct shmfd *shmfd);
static void	shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
static int	shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
static void	shm_dotruncate(struct shmfd *shmfd, off_t length);

static fo_rdwr_t	shm_read;
static fo_rdwr_t	shm_write;
static fo_truncate_t	shm_truncate;
static fo_ioctl_t	shm_ioctl;
static fo_poll_t	shm_poll;
static fo_kqfilter_t	shm_kqfilter;
static fo_stat_t	shm_stat;
static fo_close_t	shm_close;

/* File descriptor operations. */
static struct fileops shm_ops = {
	.fo_read =		shm_read,
	.fo_write =		shm_write,
	.fo_truncate =		shm_truncate,
	.fo_ioctl =		shm_ioctl,
	.fo_poll =		shm_poll,
	.fo_kqfilter =		shm_kqfilter,
	.fo_stat =		shm_stat,
	.fo_close =		shm_close,
	.fo_flags =		DFLAG_PASSABLE
};

FEATURE(posix_shm, "POSIX shared memory");

static int
shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (EOPNOTSUPP);
}

static int
shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (EOPNOTSUPP);
}

static int
shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
	int error;

	shmfd = fp->f_data;
	error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);
	shm_dotruncate(shmfd, length);
	return (0);
}

static int
shm_ioctl(struct file *fp, u_long com, void *data,
    struct ucred *active_cred, struct thread *td)
{

	return (EOPNOTSUPP);
}

static int
shm_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (EOPNOTSUPP);
}

static int
shm_kqfilter(struct file *fp, struct knote *kn)
{

	return (EOPNOTSUPP);
}

static int
shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{
	struct shmfd *shmfd;
	int error;

	shmfd = fp->f_data;
	error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
	if (error)
		return (error);

	/*
	 * Attempt to return sanish values for fstat() on a memory file
	 * descriptor.
	 */
	bzero(sb, sizeof(*sb));
	sb->st_mode = S_IFREG | shmfd->shm_mode;	/* XXX */
	sb->st_blksize = PAGE_SIZE;
	sb->st_size = shmfd->shm_size;
	sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize;
	sb->st_atimespec = shmfd->shm_atime;
	sb->st_ctimespec = shmfd->shm_ctime;
	sb->st_mtimespec = shmfd->shm_mtime;
	sb->st_birthtimespec = shmfd->shm_birthtime;
	sb->st_uid = shmfd->shm_uid;
	sb->st_gid = shmfd->shm_gid;

	return (0);
}

static int
shm_close(struct file *fp, struct thread *td)
{
	struct shmfd *shmfd;

	shmfd = fp->f_data;
	fp->f_data = NULL;
	shm_drop(shmfd);

	return (0);
}

static void
shm_dotruncate(struct shmfd *shmfd, off_t length)
{
	vm_object_t object;
	vm_page_t m;
	vm_pindex_t nobjsize;

	object = shmfd->shm_object;
	VM_OBJECT_LOCK(object);
	if (length == shmfd->shm_size) {
		VM_OBJECT_UNLOCK(object);
		return;
	}
	nobjsize = OFF_TO_IDX(length + PAGE_MASK);

	/* Are we shrinking?  If so, trim the end. */
	if (length < shmfd->shm_size) {
		/* Toss in memory pages. */
		if (nobjsize < object->size)
			vm_object_page_remove(object, nobjsize, object->size,
			    FALSE);

		/* Toss pages from swap. */
		if (object->type == OBJT_SWAP)
			swap_pager_freespace(object, nobjsize,
			    object->size - nobjsize);

		/*
		 * If the last page is partially mapped, then zero out
		 * the garbage at the end of the page.  See comments
		 * in vnode_pager_setsize() for more details.
		 *
		 * XXXJHB: This handles in memory pages, but what about
		 * a page swapped out to disk?
		 */
		if ((length & PAGE_MASK) &&
		    (m = vm_page_lookup(object, OFF_TO_IDX(length))) != NULL &&
		    m->valid != 0) {
			int base = (int)length & PAGE_MASK;
			int size = PAGE_SIZE - base;

			pmap_zero_page_area(m, base, size);
			vm_page_lock_queues();
			vm_page_set_validclean(m, base, size);
			m->dirty = VM_PAGE_BITS_ALL;
			vm_page_unlock_queues();
		} else if ((length & PAGE_MASK) &&
		    __predict_false(object->cache != NULL)) {
			vm_page_cache_free(object, OFF_TO_IDX(length),
			    nobjsize);
		}
	}
	shmfd->shm_size = length;
	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_ctime);
	shmfd->shm_mtime = shmfd->shm_ctime;
	mtx_unlock(&shm_timestamp_lock);
	object->size = nobjsize;
	VM_OBJECT_UNLOCK(object);
}
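
/*
 * Worked example for shm_dotruncate() (illustrative arithmetic, assuming
 * PAGE_SIZE == 4096 and hence PAGE_MASK == 0xfff): truncating a 3-page
 * object to length 5000 gives nobjsize = OFF_TO_IDX(5000 + 0xfff) == 2,
 * so pages at index >= 2 are removed (and freed from swap), and since
 * 5000 & PAGE_MASK == 904, any resident page at index 1 is zeroed from
 * base == 904 for size == 4096 - 904 == 3192 bytes, wiping the stale
 * bytes beyond the new end of the object.
 */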

/*
 * shmfd object management including creation and reference counting
 * routines.
 */
static struct shmfd *
shm_alloc(struct ucred *ucred, mode_t mode)
{
	struct shmfd *shmfd;

	shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
	shmfd->shm_uid = ucred->cr_uid;
	shmfd->shm_gid = ucred->cr_gid;
	shmfd->shm_mode = mode;
	shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL,
	    shmfd->shm_size, VM_PROT_DEFAULT, 0);
	KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
	VM_OBJECT_LOCK(shmfd->shm_object);
	vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING);
	vm_object_set_flag(shmfd->shm_object, OBJ_NOSPLIT);
	VM_OBJECT_UNLOCK(shmfd->shm_object);
	vfs_timestamp(&shmfd->shm_birthtime);
	shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
	    shmfd->shm_birthtime;
	refcount_init(&shmfd->shm_refs, 1);
	mac_posixshm_init(shmfd);
	mac_posixshm_create(ucred, shmfd);

	return (shmfd);
}

static struct shmfd *
shm_hold(struct shmfd *shmfd)
{

	refcount_acquire(&shmfd->shm_refs);
	return (shmfd);
}

static void
shm_drop(struct shmfd *shmfd)
{

	if (refcount_release(&shmfd->shm_refs)) {
		mac_posixshm_destroy(shmfd);
		vm_object_deallocate(shmfd->shm_object);
		free(shmfd, M_SHMFD);
	}
}
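
/*
 * Reference lifecycle sketch (a descriptive summary, not new mechanism):
 * shm_alloc() returns an object with shm_refs == 1 for the opening file
 * descriptor; shm_insert() takes an additional reference via shm_hold() on
 * behalf of the dictionary entry; shm_remove() drops the dictionary
 * reference and shm_close() drops the descriptor's reference, so
 * shm_drop() frees the backing VM object only once both the name and all
 * descriptors are gone.
 */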

/*
 * Determine if the credentials have sufficient permissions for a
 * specified combination of FREAD and FWRITE.
 */
static int
shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
{
	int acc_mode;

	acc_mode = 0;
	if (flags & FREAD)
		acc_mode |= VREAD;
	if (flags & FWRITE)
		acc_mode |= VWRITE;
	return (vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
	    acc_mode, ucred, NULL));
}

/*
 * Dictionary management.  We maintain an in-kernel dictionary to map
 * paths to shmfd objects.  We use the FNV hash on the path to store
 * the mappings in a hash table.  A worked bucket-selection example
 * follows shm_dict_init() below.
 */
static void
shm_dict_init(void *arg)
{

	mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
	sx_init(&shm_dict_lock, "shm dictionary");
	shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
}
SYSINIT(shm_dict_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_dict_init, NULL);
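
/*
 * Worked bucket-selection example (illustrative; the path "/example" is
 * arbitrary): hashinit(1024, ...) above returns the bucket array and sets
 * shm_hash to the bucket mask 0x3ff, so for
 *
 *	Fnv32_t fnv = fnv_32_str("/example", FNV1_32_INIT);
 *
 * SHM_HASH(fnv) is &shm_dictionary[fnv & 0x3ff].  shm_lookup() below walks
 * only that chain, comparing the cheap sm_fnv field before falling back to
 * strcmp() on the stored path.
 */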

static struct shmfd *
shm_lookup(char *path, Fnv32_t fnv)
{
	struct shm_mapping *map;

	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0)
			return (map->sm_shmfd);
	}

	return (NULL);
}

static void
shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
{
	struct shm_mapping *map;

	map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
	map->sm_path = path;
	map->sm_fnv = fnv;
	map->sm_shmfd = shm_hold(shmfd);
	LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link);
}

static int
shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
{
	struct shm_mapping *map;
	int error;

	error = ENOENT;
	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
		if (map->sm_fnv != fnv)
			continue;
		if (strcmp(map->sm_path, path) == 0) {
			error = mac_posixshm_check_unlink(ucred, map->sm_shmfd);
			if (error)
				return (error);
			error = shm_access(map->sm_shmfd, ucred,
			    FREAD | FWRITE);
			if (error)
				return (error);
			LIST_REMOVE(map, sm_link);
			shm_drop(map->sm_shmfd);
			free(map->sm_path, M_SHMFD);
			free(map, M_SHMFD);
			error = 0;
			break;
		}
	}

	return (error);
}

int
shm_open(struct thread *td, struct shm_open_args *uap)
{
	struct filedesc *fdp;
	struct shmfd *shmfd;
	struct file *fp;
	char *path;
	Fnv32_t fnv;
	mode_t cmode;
	int fd, error;

	if ((uap->flags & O_ACCMODE) != O_RDONLY &&
	    (uap->flags & O_ACCMODE) != O_RDWR)
		return (EINVAL);

	if ((uap->flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC)) != 0)
		return (EINVAL);

	fdp = td->td_proc->p_fd;
	cmode = (uap->mode & ~fdp->fd_cmask) & ACCESSPERMS;

	error = falloc(td, &fp, &fd);
	if (error)
		return (error);

	/* A SHM_ANON path pointer creates an anonymous object. */
	if (uap->path == SHM_ANON) {
		/* A read-only anonymous object is pointless. */
		if ((uap->flags & O_ACCMODE) == O_RDONLY) {
			fdclose(fdp, fp, fd, td);
			fdrop(fp, td);
			return (EINVAL);
		}
		shmfd = shm_alloc(td->td_ucred, cmode);
	} else {
		path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
		error = copyinstr(uap->path, path, MAXPATHLEN, NULL);

		/* Require paths to start with a '/' character. */
		if (error == 0 && path[0] != '/')
			error = EINVAL;
		if (error) {
			fdclose(fdp, fp, fd, td);
			fdrop(fp, td);
			free(path, M_SHMFD);
			return (error);
		}

		fnv = fnv_32_str(path, FNV1_32_INIT);
		sx_xlock(&shm_dict_lock);
		shmfd = shm_lookup(path, fnv);
		if (shmfd == NULL) {
			/* Object does not yet exist, create it if requested. */
			if (uap->flags & O_CREAT) {
				shmfd = shm_alloc(td->td_ucred, cmode);
				shm_insert(path, fnv, shmfd);
			} else {
				free(path, M_SHMFD);
				error = ENOENT;
			}
		} else {
			free(path, M_SHMFD);

			/*
			 * Object already exists, obtain a new
			 * reference if requested and permitted.
			 */
			if ((uap->flags & (O_CREAT | O_EXCL)) ==
			    (O_CREAT | O_EXCL))
				error = EEXIST;
			else {
				error = mac_posixshm_check_open(td->td_ucred,
				    shmfd);
				if (error == 0)
					error = shm_access(shmfd, td->td_ucred,
					    FFLAGS(uap->flags & O_ACCMODE));
			}

			/*
			 * Truncate the file back to zero length if
			 * O_TRUNC was specified and the object was
			 * opened with read/write.
			 */
			if (error == 0 &&
			    (uap->flags & (O_ACCMODE | O_TRUNC)) ==
			    (O_RDWR | O_TRUNC)) {
				error = mac_posixshm_check_truncate(
				    td->td_ucred, fp->f_cred, shmfd);
				if (error == 0)
					shm_dotruncate(shmfd, 0);
			}
			if (error == 0)
				shm_hold(shmfd);
		}
		sx_xunlock(&shm_dict_lock);

		if (error) {
			fdclose(fdp, fp, fd, td);
			fdrop(fp, td);
			return (error);
		}
	}

	finit(fp, FFLAGS(uap->flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);

	FILEDESC_XLOCK(fdp);
	if (fdp->fd_ofiles[fd] == fp)
		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
	FILEDESC_XUNLOCK(fdp);
	td->td_retval[0] = fd;
	fdrop(fp, td);

	return (0);
}

int
shm_unlink(struct thread *td, struct shm_unlink_args *uap)
{
	char *path;
	Fnv32_t fnv;
	int error;

	path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
	if (error) {
		free(path, M_TEMP);
		return (error);
	}

	fnv = fnv_32_str(path, FNV1_32_INIT);
	sx_xlock(&shm_dict_lock);
	error = shm_remove(path, fnv, td->td_ucred);
	sx_xunlock(&shm_dict_lock);
	free(path, M_TEMP);

	return (error);
}

/*
 * mmap() helper to validate mmap() requests against shm object state
 * and give mmap() the vm_object to use for the mapping.
 */
int
shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff,
    vm_object_t *obj)
{

	/*
	 * XXXRW: This validation is probably insufficient, and subject to
	 * sign errors.  It should be fixed.
	 */
	if (foff >= shmfd->shm_size || foff + objsize > shmfd->shm_size)
		return (EINVAL);

	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_atime);
	mtx_unlock(&shm_timestamp_lock);
	vm_object_reference(shmfd->shm_object);
	*obj = shmfd->shm_object;
	return (0);
}