2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
35 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
38 #include "opt_ktrace.h"
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/sysproto.h>
43 #include <sys/event.h>
44 #include <sys/filedesc.h>
45 #include <sys/filio.h>
46 #include <sys/fcntl.h>
49 #include <sys/signalvar.h>
50 #include <sys/socketvar.h>
51 #include <sys/malloc.h>
53 #include <sys/kernel.h>
54 #include <sys/kern_syscall.h>
55 #include <sys/mapped_ioctl.h>
57 #include <sys/queue.h>
58 #include <sys/resourcevar.h>
59 #include <sys/socketops.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
64 #include <sys/ktrace.h>
67 #include <vm/vm_page.h>
69 #include <sys/file2.h>
70 #include <sys/spinlock2.h>
72 #include <machine/limits.h>
74 static MALLOC_DEFINE(M_IOCTLOPS
, "ioctlops", "ioctl data buffer");
75 static MALLOC_DEFINE(M_IOCTLMAP
, "ioctlmap", "mapped ioctl handler buffer");
76 static MALLOC_DEFINE(M_SELECT
, "select", "select() buffer");
77 MALLOC_DEFINE(M_IOV
, "iov", "large iov's");
79 typedef struct kfd_set
{
83 enum select_copyin_states
{
84 COPYIN_READ
, COPYIN_WRITE
, COPYIN_EXCEPT
, COPYIN_DONE
};
86 struct select_kevent_copyin_args
{
90 int active_set
; /* One of select_copyin_states */
91 struct lwp
*lwp
; /* Pointer to our lwp */
92 int num_fds
; /* Number of file descriptors (syscall arg) */
93 int proc_fds
; /* Processed fd's (wraps) */
94 int error
; /* Returned to userland */
97 struct poll_kevent_copyin_args
{
105 static struct lwkt_token mioctl_token
= LWKT_TOKEN_INITIALIZER(mioctl_token
);
107 static int doselect(int nd
, fd_set
*in
, fd_set
*ou
, fd_set
*ex
,
108 struct timespec
*ts
, int *res
);
109 static int dopoll(int nfds
, struct pollfd
*fds
, struct timespec
*ts
,
110 int *res
, int flags
);
111 static int dofileread(int, struct file
*, struct uio
*, int, size_t *);
112 static int dofilewrite(int, struct file
*, struct uio
*, int, size_t *);
120 sys_read(struct read_args
*uap
)
122 struct thread
*td
= curthread
;
127 if ((ssize_t
)uap
->nbyte
< 0)
130 aiov
.iov_base
= uap
->buf
;
131 aiov
.iov_len
= uap
->nbyte
;
132 auio
.uio_iov
= &aiov
;
134 auio
.uio_offset
= -1;
135 auio
.uio_resid
= uap
->nbyte
;
136 auio
.uio_rw
= UIO_READ
;
137 auio
.uio_segflg
= UIO_USERSPACE
;
140 error
= kern_preadv(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
145 * Positioned (Pread) read system call
150 sys_extpread(struct extpread_args
*uap
)
152 struct thread
*td
= curthread
;
158 if ((ssize_t
)uap
->nbyte
< 0)
161 aiov
.iov_base
= uap
->buf
;
162 aiov
.iov_len
= uap
->nbyte
;
163 auio
.uio_iov
= &aiov
;
165 auio
.uio_offset
= uap
->offset
;
166 auio
.uio_resid
= uap
->nbyte
;
167 auio
.uio_rw
= UIO_READ
;
168 auio
.uio_segflg
= UIO_USERSPACE
;
171 flags
= uap
->flags
& O_FMASK
;
172 if (uap
->offset
!= (off_t
)-1)
175 error
= kern_preadv(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
180 * Scatter read system call.
185 sys_readv(struct readv_args
*uap
)
187 struct thread
*td
= curthread
;
189 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
192 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
197 auio
.uio_iovcnt
= uap
->iovcnt
;
198 auio
.uio_offset
= -1;
199 auio
.uio_rw
= UIO_READ
;
200 auio
.uio_segflg
= UIO_USERSPACE
;
203 error
= kern_preadv(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
205 iovec_free(&iov
, aiov
);
211 * Scatter positioned read system call.
216 sys_extpreadv(struct extpreadv_args
*uap
)
218 struct thread
*td
= curthread
;
220 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
224 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
229 auio
.uio_iovcnt
= uap
->iovcnt
;
230 auio
.uio_offset
= uap
->offset
;
231 auio
.uio_rw
= UIO_READ
;
232 auio
.uio_segflg
= UIO_USERSPACE
;
235 flags
= uap
->flags
& O_FMASK
;
236 if (uap
->offset
!= (off_t
)-1)
239 error
= kern_preadv(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
241 iovec_free(&iov
, aiov
);
249 kern_preadv(int fd
, struct uio
*auio
, int flags
, size_t *res
)
251 struct thread
*td
= curthread
;
255 fp
= holdfp(td
, fd
, FREAD
);
258 if (flags
& O_FOFFSET
&& fp
->f_type
!= DTYPE_VNODE
) {
261 error
= dofileread(fd
, fp
, auio
, flags
, res
);
269 * Common code for readv and preadv that reads data in
270 * from a file using the passed in uio, offset, and flags.
272 * MPALMOSTSAFE - ktrace needs help
275 dofileread(int fd
, struct file
*fp
, struct uio
*auio
, int flags
, size_t *res
)
280 struct thread
*td
= curthread
;
281 struct iovec
*ktriov
= NULL
;
287 * if tracing, save a copy of iovec
289 if (KTRPOINT(td
, KTR_GENIO
)) {
290 int iovlen
= auio
->uio_iovcnt
* sizeof(struct iovec
);
292 ktriov
= kmalloc(iovlen
, M_TEMP
, M_WAITOK
);
293 bcopy((caddr_t
)auio
->uio_iov
, (caddr_t
)ktriov
, iovlen
);
297 len
= auio
->uio_resid
;
298 error
= fo_read(fp
, auio
, fp
->f_cred
, flags
);
300 if (auio
->uio_resid
!= len
&& (error
== ERESTART
||
301 error
== EINTR
|| error
== EWOULDBLOCK
))
305 if (ktriov
!= NULL
) {
307 ktruio
.uio_iov
= ktriov
;
308 ktruio
.uio_resid
= len
- auio
->uio_resid
;
309 ktrgenio(td
->td_lwp
, fd
, UIO_READ
, &ktruio
, error
);
311 kfree(ktriov
, M_TEMP
);
315 *res
= len
- auio
->uio_resid
;
326 sys_write(struct write_args
*uap
)
328 struct thread
*td
= curthread
;
333 if ((ssize_t
)uap
->nbyte
< 0)
336 aiov
.iov_base
= (void *)(uintptr_t)uap
->buf
;
337 aiov
.iov_len
= uap
->nbyte
;
338 auio
.uio_iov
= &aiov
;
340 auio
.uio_offset
= -1;
341 auio
.uio_resid
= uap
->nbyte
;
342 auio
.uio_rw
= UIO_WRITE
;
343 auio
.uio_segflg
= UIO_USERSPACE
;
346 error
= kern_pwritev(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
357 sys_extpwrite(struct extpwrite_args
*uap
)
359 struct thread
*td
= curthread
;
365 if ((ssize_t
)uap
->nbyte
< 0)
368 aiov
.iov_base
= (void *)(uintptr_t)uap
->buf
;
369 aiov
.iov_len
= uap
->nbyte
;
370 auio
.uio_iov
= &aiov
;
372 auio
.uio_offset
= uap
->offset
;
373 auio
.uio_resid
= uap
->nbyte
;
374 auio
.uio_rw
= UIO_WRITE
;
375 auio
.uio_segflg
= UIO_USERSPACE
;
378 flags
= uap
->flags
& O_FMASK
;
379 if (uap
->offset
!= (off_t
)-1)
381 error
= kern_pwritev(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
389 sys_writev(struct writev_args
*uap
)
391 struct thread
*td
= curthread
;
393 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
396 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
401 auio
.uio_iovcnt
= uap
->iovcnt
;
402 auio
.uio_offset
= -1;
403 auio
.uio_rw
= UIO_WRITE
;
404 auio
.uio_segflg
= UIO_USERSPACE
;
407 error
= kern_pwritev(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
409 iovec_free(&iov
, aiov
);
415 * Gather positioned write system call
420 sys_extpwritev(struct extpwritev_args
*uap
)
422 struct thread
*td
= curthread
;
424 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
428 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
433 auio
.uio_iovcnt
= uap
->iovcnt
;
434 auio
.uio_offset
= uap
->offset
;
435 auio
.uio_rw
= UIO_WRITE
;
436 auio
.uio_segflg
= UIO_USERSPACE
;
439 flags
= uap
->flags
& O_FMASK
;
440 if (uap
->offset
!= (off_t
)-1)
443 error
= kern_pwritev(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
445 iovec_free(&iov
, aiov
);
453 kern_pwritev(int fd
, struct uio
*auio
, int flags
, size_t *res
)
455 struct thread
*td
= curthread
;
459 fp
= holdfp(td
, fd
, FWRITE
);
462 else if ((flags
& O_FOFFSET
) && fp
->f_type
!= DTYPE_VNODE
) {
465 error
= dofilewrite(fd
, fp
, auio
, flags
, res
);
473 * Common code for writev and pwritev that writes data to
474 * a file using the passed in uio, offset, and flags.
476 * MPALMOSTSAFE - ktrace needs help
479 dofilewrite(int fd
, struct file
*fp
, struct uio
*auio
, int flags
, size_t *res
)
481 struct thread
*td
= curthread
;
482 struct lwp
*lp
= td
->td_lwp
;
486 struct iovec
*ktriov
= NULL
;
492 * if tracing, save a copy of iovec and uio
494 if (KTRPOINT(td
, KTR_GENIO
)) {
495 int iovlen
= auio
->uio_iovcnt
* sizeof(struct iovec
);
497 ktriov
= kmalloc(iovlen
, M_TEMP
, M_WAITOK
);
498 bcopy((caddr_t
)auio
->uio_iov
, (caddr_t
)ktriov
, iovlen
);
502 len
= auio
->uio_resid
;
503 error
= fo_write(fp
, auio
, fp
->f_cred
, flags
);
505 if (auio
->uio_resid
!= len
&& (error
== ERESTART
||
506 error
== EINTR
|| error
== EWOULDBLOCK
))
508 /* Socket layer is responsible for issuing SIGPIPE. */
509 if (error
== EPIPE
&& fp
->f_type
!= DTYPE_SOCKET
)
510 lwpsignal(lp
->lwp_proc
, lp
, SIGPIPE
);
513 if (ktriov
!= NULL
) {
515 ktruio
.uio_iov
= ktriov
;
516 ktruio
.uio_resid
= len
- auio
->uio_resid
;
517 ktrgenio(lp
, fd
, UIO_WRITE
, &ktruio
, error
);
519 kfree(ktriov
, M_TEMP
);
523 *res
= len
- auio
->uio_resid
;
534 sys_ioctl(struct ioctl_args
*uap
)
538 error
= mapped_ioctl(uap
->fd
, uap
->com
, uap
->data
, NULL
, &uap
->sysmsg
);
542 struct ioctl_map_entry
{
544 struct ioctl_map_range
*cmd_ranges
;
545 LIST_ENTRY(ioctl_map_entry
) entries
;
549 * The true heart of all ioctl syscall handlers (native, emulation).
550 * If map != NULL, it will be searched for a matching entry for com,
551 * and appropriate conversions/conversion functions will be utilized.
556 mapped_ioctl(int fd
, u_long com
, caddr_t uspc_data
, struct ioctl_map
*map
,
559 struct thread
*td
= curthread
;
560 struct proc
*p
= td
->td_proc
;
563 struct ioctl_map_range
*iomc
= NULL
;
569 #define STK_PARAMS 128
571 char stkbuf
[STK_PARAMS
];
579 fp
= holdfp(td
, fd
, FREAD
|FWRITE
);
583 if (map
!= NULL
) { /* obey translation map */
585 struct ioctl_map_entry
*e
;
587 maskcmd
= com
& map
->mask
;
589 lwkt_gettoken(&mioctl_token
);
590 LIST_FOREACH(e
, &map
->mapping
, entries
) {
591 for (iomc
= e
->cmd_ranges
; iomc
->start
!= 0 ||
592 iomc
->maptocmd
!= 0 || iomc
->wrapfunc
!= NULL
||
593 iomc
->mapfunc
!= NULL
;
595 if (maskcmd
>= iomc
->start
&&
596 maskcmd
<= iomc
->end
)
600 /* Did we find a match? */
601 if (iomc
->start
!= 0 || iomc
->maptocmd
!= 0 ||
602 iomc
->wrapfunc
!= NULL
|| iomc
->mapfunc
!= NULL
)
605 lwkt_reltoken(&mioctl_token
);
608 (iomc
->start
== 0 && iomc
->maptocmd
== 0
609 && iomc
->wrapfunc
== NULL
&& iomc
->mapfunc
== NULL
)) {
610 kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
611 map
->sys
, fd
, maskcmd
,
612 (int)((maskcmd
>> 8) & 0xff),
613 (int)(maskcmd
& 0xff));
619 * If it's a non-range one to one mapping, maptocmd should be
620 * correct. If it's a ranged one to one mapping, we pass the
621 * original value of com, and for a range mapped to a different
622 * range, we always need a mapping function to translate the
623 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
625 if (iomc
->start
== iomc
->end
&& iomc
->maptocmd
== iomc
->maptoend
) {
626 com
= iomc
->maptocmd
;
627 } else if (iomc
->start
== iomc
->maptocmd
&& iomc
->end
== iomc
->maptoend
) {
628 if (iomc
->mapfunc
!= NULL
)
629 com
= iomc
->mapfunc(iomc
->start
, iomc
->end
,
630 iomc
->start
, iomc
->end
,
633 if (iomc
->mapfunc
!= NULL
) {
634 com
= iomc
->mapfunc(iomc
->start
, iomc
->end
,
635 iomc
->maptocmd
, iomc
->maptoend
,
638 kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
639 map
->sys
, fd
, maskcmd
,
640 (int)((maskcmd
>> 8) & 0xff),
641 (int)(maskcmd
& 0xff));
650 error
= fclrfdflags(p
->p_fd
, fd
, UF_EXCLOSE
);
653 error
= fsetfdflags(p
->p_fd
, fd
, UF_EXCLOSE
);
658 * Interpret high order word to find amount of data to be
659 * copied to/from the user's address space.
661 size
= IOCPARM_LEN(com
);
662 if (size
> IOCPARM_MAX
) {
667 if ((com
& IOC_VOID
) == 0 && size
> sizeof(ubuf
.stkbuf
)) {
668 memp
= kmalloc(size
, M_IOCTLOPS
, M_WAITOK
);
674 if (com
& IOC_VOID
) {
675 *(caddr_t
*)data
= uspc_data
;
676 } else if (com
& IOC_IN
) {
678 error
= copyin(uspc_data
, data
, (size_t)size
);
682 *(caddr_t
*)data
= uspc_data
;
684 } else if ((com
& IOC_OUT
) != 0 && size
) {
686 * Zero the buffer so the user always
687 * gets back something deterministic.
689 bzero(data
, (size_t)size
);
694 if ((tmp
= *(int *)data
))
695 atomic_set_int(&fp
->f_flag
, FNONBLOCK
);
697 atomic_clear_int(&fp
->f_flag
, FNONBLOCK
);
702 if ((tmp
= *(int *)data
))
703 atomic_set_int(&fp
->f_flag
, FASYNC
);
705 atomic_clear_int(&fp
->f_flag
, FASYNC
);
706 error
= fo_ioctl(fp
, FIOASYNC
, (caddr_t
)&tmp
, cred
, msg
);
711 * If there is a override function,
712 * call it instead of directly routing the call
714 if (map
!= NULL
&& iomc
->wrapfunc
!= NULL
)
715 error
= iomc
->wrapfunc(fp
, com
, ocom
, data
, cred
);
717 error
= fo_ioctl(fp
, com
, data
, cred
, msg
);
719 * Copy any data to user, size was
720 * already set and checked above.
722 if (error
== 0 && (com
& IOC_OUT
) != 0 && size
!= 0)
723 error
= copyout(data
, uspc_data
, (size_t)size
);
728 kfree(memp
, M_IOCTLOPS
);
738 mapped_ioctl_register_handler(struct ioctl_map_handler
*he
)
740 struct ioctl_map_entry
*ne
;
742 KKASSERT(he
!= NULL
&& he
->map
!= NULL
&& he
->cmd_ranges
!= NULL
&&
743 he
->subsys
!= NULL
&& *he
->subsys
!= '\0');
745 ne
= kmalloc(sizeof(struct ioctl_map_entry
), M_IOCTLMAP
,
748 ne
->subsys
= he
->subsys
;
749 ne
->cmd_ranges
= he
->cmd_ranges
;
751 lwkt_gettoken(&mioctl_token
);
752 LIST_INSERT_HEAD(&he
->map
->mapping
, ne
, entries
);
753 lwkt_reltoken(&mioctl_token
);
762 mapped_ioctl_unregister_handler(struct ioctl_map_handler
*he
)
764 struct ioctl_map_entry
*ne
;
767 KKASSERT(he
!= NULL
&& he
->map
!= NULL
&& he
->cmd_ranges
!= NULL
);
769 lwkt_gettoken(&mioctl_token
);
770 LIST_FOREACH(ne
, &he
->map
->mapping
, entries
) {
771 if (ne
->cmd_ranges
== he
->cmd_ranges
) {
772 LIST_REMOVE(ne
, entries
);
773 kfree(ne
, M_IOCTLMAP
);
778 lwkt_reltoken(&mioctl_token
);
782 static int nselcoll
; /* Select collisions since boot */
784 SYSCTL_INT(_kern
, OID_AUTO
, nselcoll
, CTLFLAG_RD
, &nselcoll
, 0, "");
785 static int nseldebug
;
786 SYSCTL_INT(_kern
, OID_AUTO
, nseldebug
, CTLFLAG_RW
, &nseldebug
, 0, "");
789 * Select system call.
794 sys_select(struct select_args
*uap
)
797 struct timespec
*ktsp
, kts
;
801 * Get timeout if any.
803 if (uap
->tv
!= NULL
) {
804 error
= copyin(uap
->tv
, &ktv
, sizeof (ktv
));
807 TIMEVAL_TO_TIMESPEC(&ktv
, &kts
);
816 error
= doselect(uap
->nd
, uap
->in
, uap
->ou
, uap
->ex
, ktsp
,
817 &uap
->sysmsg_result
);
824 * Pselect system call.
827 sys_pselect(struct pselect_args
*uap
)
829 struct thread
*td
= curthread
;
830 struct lwp
*lp
= td
->td_lwp
;
831 struct timespec
*ktsp
, kts
;
836 * Get timeout if any.
838 if (uap
->ts
!= NULL
) {
839 error
= copyin(uap
->ts
, &kts
, sizeof (kts
));
848 * Install temporary signal mask if any provided.
850 if (uap
->sigmask
!= NULL
) {
851 error
= copyin(uap
->sigmask
, &sigmask
, sizeof(sigmask
));
854 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
855 lp
->lwp_oldsigmask
= lp
->lwp_sigmask
;
856 SIG_CANTMASK(sigmask
);
857 lp
->lwp_sigmask
= sigmask
;
858 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
864 error
= doselect(uap
->nd
, uap
->in
, uap
->ou
, uap
->ex
, ktsp
,
865 &uap
->sysmsg_result
);
867 if (uap
->sigmask
!= NULL
) {
868 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
869 /* doselect() responsible for turning ERESTART into EINTR */
870 KKASSERT(error
!= ERESTART
);
871 if (error
== EINTR
) {
873 * We can't restore the previous signal mask now
874 * because it could block the signal that interrupted
875 * us. So make a note to restore it after executing
878 lp
->lwp_flags
|= LWP_OLDMASK
;
881 * No handler to run. Restore previous mask immediately.
883 lp
->lwp_sigmask
= lp
->lwp_oldsigmask
;
885 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
892 select_copyin(void *arg
, struct kevent
*kevp
, int maxevents
, int *events
)
894 struct select_kevent_copyin_args
*skap
= NULL
;
901 skap
= (struct select_kevent_copyin_args
*)arg
;
903 if (*events
== maxevents
)
906 while (skap
->active_set
< COPYIN_DONE
) {
907 switch (skap
->active_set
) {
910 * Register descriptors for the read filter
912 fdp
= skap
->read_set
;
913 filter
= EVFILT_READ
;
914 fflags
= NOTE_OLDAPI
;
922 * Register descriptors for the write filter
924 fdp
= skap
->write_set
;
925 filter
= EVFILT_WRITE
;
926 fflags
= NOTE_OLDAPI
;
934 * Register descriptors for the exception filter
936 fdp
= skap
->except_set
;
937 filter
= EVFILT_EXCEPT
;
938 fflags
= NOTE_OLDAPI
| NOTE_OOB
;
946 * Nothing left to register
952 while (skap
->proc_fds
< skap
->num_fds
) {
954 if (FD_ISSET(fd
, fdp
)) {
955 kev
= &kevp
[*events
];
956 EV_SET(kev
, fd
, filter
,
960 skap
->lwp
->lwp_kqueue_serial
);
965 kprintf("select fd %d filter %d "
966 "serial %ju\n", fd
, filter
,
968 skap
->lwp
->lwp_kqueue_serial
);
972 if (*events
== maxevents
)
983 select_copyout(void *arg
, struct kevent
*kevp
, int count
, int *res
)
985 struct select_kevent_copyin_args
*skap
;
990 skap
= (struct select_kevent_copyin_args
*)arg
;
992 for (i
= 0; i
< count
; ++i
) {
994 * Filter out and delete spurious events
996 if ((uint64_t)(uintptr_t)kevp
[i
].udata
!=
997 skap
->lwp
->lwp_kqueue_serial
) {
999 kev
.flags
= EV_DISABLE
|EV_DELETE
;
1001 kqueue_register(&skap
->lwp
->lwp_kqueue
, &kev
, &n
);
1003 kprintf("select fd %ju mismatched serial %ju\n",
1004 (uintmax_t)kevp
[i
].ident
,
1005 (uintmax_t)skap
->lwp
->lwp_kqueue_serial
);
1013 if (kevp
[i
].flags
& EV_ERROR
) {
1014 int error
= kevp
[i
].data
;
1019 * A bad file descriptor is considered a
1020 * fatal error for select, bail out.
1022 skap
->error
= error
;
1028 * Select silently swallows any unknown errors
1029 * for descriptors in the read or write sets.
1031 * ALWAYS filter out EOPNOTSUPP errors from
1032 * filters (at least until all filters support
1035 * We also filter out ENODEV since dev_dkqfilter
1036 * returns ENODEV if EOPNOTSUPP is returned in an
1041 if (kevp
[i
].filter
!= EVFILT_READ
&&
1042 kevp
[i
].filter
!= EVFILT_WRITE
&&
1043 error
!= EOPNOTSUPP
&&
1045 skap
->error
= error
;
1052 kprintf("select fd %ju filter %d error %d\n",
1053 (uintmax_t)kevp
[i
].ident
,
1054 kevp
[i
].filter
, error
);
1058 switch (kevp
[i
].filter
) {
1060 FD_SET(kevp
[i
].ident
, skap
->read_set
);
1063 FD_SET(kevp
[i
].ident
, skap
->write_set
);
1066 FD_SET(kevp
[i
].ident
, skap
->except_set
);
1077 * Copy select bits in from userland. Allocate kernel memory if the
1081 getbits(int bytes
, fd_set
*in_set
, kfd_set
**out_set
, kfd_set
*tmp_set
)
1086 if (bytes
< sizeof(*tmp_set
))
1089 *out_set
= kmalloc(bytes
, M_SELECT
, M_WAITOK
);
1090 error
= copyin(in_set
, *out_set
, bytes
);
1099 * Copy returned select bits back out to userland.
1102 putbits(int bytes
, kfd_set
*in_set
, fd_set
*out_set
)
1107 error
= copyout(in_set
, out_set
, bytes
);
1115 dotimeout_only(struct timespec
*ts
)
1117 return(nanosleep1(ts
, NULL
));
1121 * Common code for sys_select() and sys_pselect().
1123 * in, out and ex are userland pointers. ts must point to validated
1124 * kernel-side timeout value or NULL for infinite timeout. res must
1125 * point to syscall return value.
1128 doselect(int nd
, fd_set
*read
, fd_set
*write
, fd_set
*except
,
1129 struct timespec
*ts
, int *res
)
1131 struct proc
*p
= curproc
;
1132 struct select_kevent_copyin_args
*kap
, ka
;
1142 return (dotimeout_only(ts
));
1144 if (nd
> p
->p_fd
->fd_nfiles
) /* limit kmalloc */
1145 nd
= p
->p_fd
->fd_nfiles
;
1148 kap
->lwp
= curthread
->td_lwp
;
1152 kap
->active_set
= COPYIN_READ
;
1155 * Calculate bytes based on the number of __fd_mask[] array entries
1156 * multiplied by the size of __fd_mask.
1158 bytes
= howmany(nd
, __NFDBITS
) * sizeof(__fd_mask
);
1160 /* kap->read_set = NULL; not needed */
1161 kap
->write_set
= NULL
;
1162 kap
->except_set
= NULL
;
1164 error
= getbits(bytes
, read
, &kap
->read_set
, &read_tmp
);
1166 error
= getbits(bytes
, write
, &kap
->write_set
, &write_tmp
);
1168 error
= getbits(bytes
, except
, &kap
->except_set
, &except_tmp
);
1173 * NOTE: Make sure the max events passed to kern_kevent() is
1174 * effectively unlimited. (nd * 3) accomplishes this.
1176 * (*res) continues to increment as returned events are
1179 error
= kern_kevent(&kap
->lwp
->lwp_kqueue
, 0x7FFFFFFF, res
, kap
,
1180 select_copyin
, select_copyout
, ts
, 0);
1182 error
= putbits(bytes
, kap
->read_set
, read
);
1184 error
= putbits(bytes
, kap
->write_set
, write
);
1186 error
= putbits(bytes
, kap
->except_set
, except
);
1189 * An error from an individual event that should be passed
1190 * back to userland (EBADF)
1199 if (kap
->read_set
&& kap
->read_set
!= &read_tmp
)
1200 kfree(kap
->read_set
, M_SELECT
);
1201 if (kap
->write_set
&& kap
->write_set
!= &write_tmp
)
1202 kfree(kap
->write_set
, M_SELECT
);
1203 if (kap
->except_set
&& kap
->except_set
!= &except_tmp
)
1204 kfree(kap
->except_set
, M_SELECT
);
1206 kap
->lwp
->lwp_kqueue_serial
+= kap
->num_fds
;
1217 sys_poll(struct poll_args
*uap
)
1219 struct timespec ts
, *tsp
;
1222 if (uap
->timeout
!= INFTIM
) {
1223 if (uap
->timeout
< 0)
1225 ts
.tv_sec
= uap
->timeout
/ 1000;
1226 ts
.tv_nsec
= (uap
->timeout
% 1000) * 1000 * 1000;
1232 error
= dopoll(uap
->nfds
, uap
->fds
, tsp
, &uap
->sysmsg_result
, 0);
1238 * Ppoll system call.
1243 sys_ppoll(struct ppoll_args
*uap
)
1245 struct thread
*td
= curthread
;
1246 struct lwp
*lp
= td
->td_lwp
;
1247 struct timespec
*ktsp
, kts
;
1252 * Get timeout if any.
1254 if (uap
->ts
!= NULL
) {
1255 error
= copyin(uap
->ts
, &kts
, sizeof (kts
));
1264 * Install temporary signal mask if any provided.
1266 if (uap
->sigmask
!= NULL
) {
1267 error
= copyin(uap
->sigmask
, &sigmask
, sizeof(sigmask
));
1270 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
1271 lp
->lwp_oldsigmask
= lp
->lwp_sigmask
;
1272 SIG_CANTMASK(sigmask
);
1273 lp
->lwp_sigmask
= sigmask
;
1274 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
1277 error
= dopoll(uap
->nfds
, uap
->fds
, ktsp
, &uap
->sysmsg_result
,
1278 ktsp
!= NULL
? KEVENT_TIMEOUT_PRECISE
: 0);
1280 if (uap
->sigmask
!= NULL
) {
1281 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
1282 /* dopoll() responsible for turning ERESTART into EINTR */
1283 KKASSERT(error
!= ERESTART
);
1284 if (error
== EINTR
) {
1286 * We can't restore the previous signal mask now
1287 * because it could block the signal that interrupted
1288 * us. So make a note to restore it after executing
1291 lp
->lwp_flags
|= LWP_OLDMASK
;
1294 * No handler to run. Restore previous mask immediately.
1296 lp
->lwp_sigmask
= lp
->lwp_oldsigmask
;
1298 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
1305 poll_copyin(void *arg
, struct kevent
*kevp
, int maxevents
, int *events
)
1307 struct poll_kevent_copyin_args
*pkap
;
1312 pkap
= (struct poll_kevent_copyin_args
*)arg
;
1314 while (pkap
->pfds
< pkap
->nfds
) {
1315 pfd
= &pkap
->fds
[pkap
->pfds
];
1317 /* Clear return events */
1320 /* Do not check if fd is equal to -1 */
1321 if (pfd
->fd
== -1) {
1327 if (pfd
->events
& (POLLIN
| POLLRDNORM
))
1329 if (pfd
->events
& (POLLOUT
| POLLWRNORM
))
1331 if (pfd
->events
& (POLLPRI
| POLLRDBAND
))
1334 if (*events
+ kev_count
> maxevents
)
1338 * NOTE: A combined serial number and poll array index is
1339 * stored in kev->udata.
1341 kev
= &kevp
[*events
];
1342 if (pfd
->events
& (POLLIN
| POLLRDNORM
)) {
1343 EV_SET(kev
++, pfd
->fd
, EVFILT_READ
, EV_ADD
|EV_ENABLE
,
1344 NOTE_OLDAPI
, 0, (void *)(uintptr_t)
1345 (pkap
->lwp
->lwp_kqueue_serial
+ pkap
->pfds
));
1347 if (pfd
->events
& (POLLOUT
| POLLWRNORM
)) {
1348 EV_SET(kev
++, pfd
->fd
, EVFILT_WRITE
, EV_ADD
|EV_ENABLE
,
1349 NOTE_OLDAPI
, 0, (void *)(uintptr_t)
1350 (pkap
->lwp
->lwp_kqueue_serial
+ pkap
->pfds
));
1352 if (pfd
->events
& (POLLPRI
| POLLRDBAND
)) {
1353 EV_SET(kev
++, pfd
->fd
, EVFILT_EXCEPT
, EV_ADD
|EV_ENABLE
,
1354 NOTE_OLDAPI
| NOTE_OOB
, 0,
1356 (pkap
->lwp
->lwp_kqueue_serial
+ pkap
->pfds
));
1360 kprintf("poll index %d/%d fd %d events %08x "
1361 "serial %ju\n", pkap
->pfds
, pkap
->nfds
-1,
1362 pfd
->fd
, pfd
->events
,
1363 (uintmax_t)pkap
->lwp
->lwp_kqueue_serial
);
1367 (*events
) += kev_count
;
1374 poll_copyout(void *arg
, struct kevent
*kevp
, int count
, int *res
)
1376 struct poll_kevent_copyin_args
*pkap
;
1384 pkap
= (struct poll_kevent_copyin_args
*)arg
;
1386 for (i
= 0; i
< count
; ++i
) {
1388 * Extract the poll array index and delete spurious events.
1389 * We can easily tell if the serial number is incorrect
1390 * by checking whether the extracted index is out of range.
1392 pi
= (uint64_t)(uintptr_t)kevp
[i
].udata
-
1393 pkap
->lwp
->lwp_kqueue_serial
;
1395 if (pi
>= pkap
->nfds
) {
1397 kev
.flags
= EV_DISABLE
|EV_DELETE
;
1399 kqueue_register(&pkap
->lwp
->lwp_kqueue
, &kev
, &n
);
1401 kprintf("poll index %ju out of range against "
1402 "serial %ju\n", (uintmax_t)pi
,
1403 (uintmax_t)pkap
->lwp
->lwp_kqueue_serial
);
1407 pfd
= &pkap
->fds
[pi
];
1408 if (kevp
[i
].ident
== pfd
->fd
) {
1410 * A single descriptor may generate an error against
1411 * more than one filter, make sure to set the
1412 * appropriate flags but do not increment (*res)
1415 count_res
= (pfd
->revents
== 0);
1416 if (kevp
[i
].flags
& EV_ERROR
) {
1417 switch(kevp
[i
].data
) {
1420 /* Bad file descriptor */
1423 pfd
->revents
|= POLLNVAL
;
1427 * Poll silently swallows any unknown
1428 * errors except in the case of POLLPRI
1429 * (OOB/urgent data).
1431 * ALWAYS filter out EOPNOTSUPP errors
1432 * from filters, common applications
1433 * set POLLPRI|POLLRDBAND and most
1434 * filters do not support EVFILT_EXCEPT.
1436 * We also filter out ENODEV since dev_dkqfilter
1437 * returns ENODEV if EOPNOTSUPP is returned in an
1442 if (kevp
[i
].filter
!= EVFILT_READ
&&
1443 kevp
[i
].filter
!= EVFILT_WRITE
&&
1444 kevp
[i
].data
!= EOPNOTSUPP
&&
1445 kevp
[i
].data
!= ENODEV
) {
1448 pfd
->revents
|= POLLERR
;
1453 kprintf("poll index %ju fd %d "
1454 "filter %d error %jd\n",
1455 (uintmax_t)pi
, pfd
->fd
,
1457 (intmax_t)kevp
[i
].data
);
1462 switch (kevp
[i
].filter
) {
1465 * NODATA on the read side can indicate a
1466 * half-closed situation and not necessarily
1467 * a disconnect, so depend on the user
1468 * issuing a read() and getting 0 bytes back.
1470 * If EV_HUP is set the peer completely
1471 * disconnected and we can set POLLHUP
1472 * once data is exhausted.
1474 if (kevp
[i
].flags
& EV_NODATA
) {
1475 if (kevp
[i
].flags
& EV_HUP
)
1476 pfd
->revents
|= POLLHUP
;
1478 if ((kevp
[i
].flags
& EV_EOF
) &&
1479 kevp
[i
].fflags
!= 0)
1480 pfd
->revents
|= POLLERR
;
1481 if (pfd
->events
& POLLIN
)
1482 pfd
->revents
|= POLLIN
;
1483 if (pfd
->events
& POLLRDNORM
)
1484 pfd
->revents
|= POLLRDNORM
;
1488 * As per the OpenGroup POLLHUP is mutually
1489 * exclusive with the writability flags. I
1490 * consider this a bit broken but...
1492 * In this case a disconnect is implied even
1493 * for a half-closed (write side) situation.
1495 if (kevp
[i
].flags
& EV_EOF
) {
1496 pfd
->revents
|= POLLHUP
;
1497 if (kevp
[i
].fflags
!= 0)
1498 pfd
->revents
|= POLLERR
;
1500 if (pfd
->events
& POLLOUT
)
1501 pfd
->revents
|= POLLOUT
;
1502 if (pfd
->events
& POLLWRNORM
)
1503 pfd
->revents
|= POLLWRNORM
;
1508 * EV_NODATA should never be tagged for this
1511 if (pfd
->events
& POLLPRI
)
1512 pfd
->revents
|= POLLPRI
;
1513 if (pfd
->events
& POLLRDBAND
)
1514 pfd
->revents
|= POLLRDBAND
;
1519 kprintf("poll index %ju/%d fd %d "
1520 "revents %08x\n", (uintmax_t)pi
, pkap
->nfds
,
1521 pfd
->fd
, pfd
->revents
);
1524 if (count_res
&& pfd
->revents
)
1528 kprintf("poll index %ju mismatch %ju/%d\n",
1529 (uintmax_t)pi
, (uintmax_t)kevp
[i
].ident
,
1539 dopoll(int nfds
, struct pollfd
*fds
, struct timespec
*ts
, int *res
, int flags
)
1541 struct poll_kevent_copyin_args ka
;
1542 struct pollfd sfds
[64];
1550 if (nfds
== 0 && ts
)
1551 return (dotimeout_only(ts
));
1554 * This is a bit arbitrary but we need to limit internal kmallocs.
1556 if (nfds
> maxfilesperproc
* 2)
1557 nfds
= maxfilesperproc
* 2;
1558 bytes
= sizeof(struct pollfd
) * nfds
;
1560 ka
.lwp
= curthread
->td_lwp
;
1568 ka
.fds
= kmalloc(bytes
, M_SELECT
, M_WAITOK
);
1570 error
= copyin(fds
, ka
.fds
, bytes
);
1572 error
= kern_kevent(&ka
.lwp
->lwp_kqueue
, 0x7FFFFFFF, res
, &ka
,
1573 poll_copyin
, poll_copyout
, ts
, flags
);
1576 error
= copyout(ka
.fds
, fds
, bytes
);
1579 kfree(ka
.fds
, M_SELECT
);
1581 ka
.lwp
->lwp_kqueue_serial
+= nfds
;
1587 socket_wait_copyin(void *arg
, struct kevent
*kevp
, int maxevents
, int *events
)
1593 socket_wait_copyout(void *arg
, struct kevent
*kevp
, int count
, int *res
)
1599 extern struct fileops socketops
;
1602 * NOTE: Callers of socket_wait() must already have a reference on the
1606 socket_wait(struct socket
*so
, struct timespec
*ts
, int *res
)
1608 struct thread
*td
= curthread
;
1615 if ((error
= falloc(td
->td_lwp
, &fp
, &fd
)) != 0)
1618 fp
->f_type
= DTYPE_SOCKET
;
1619 fp
->f_flag
= FREAD
| FWRITE
;
1620 fp
->f_ops
= &socketops
;
1622 fsetfd(td
->td_lwp
->lwp_proc
->p_fd
, fp
, fd
);
1623 fsetfdflags(td
->td_proc
->p_fd
, fd
, UF_EXCLOSE
);
1625 bzero(&kq
, sizeof(kq
));
1626 kqueue_init(&kq
, td
->td_lwp
->lwp_proc
->p_fd
);
1627 EV_SET(&kev
, fd
, EVFILT_READ
, EV_ADD
|EV_ENABLE
, 0, 0, NULL
);
1629 if ((error
= kqueue_register(&kq
, &kev
, &n
)) != 0) {
1634 error
= kern_kevent(&kq
, 1, res
, NULL
, socket_wait_copyin
,
1635 socket_wait_copyout
, ts
, 0);
1637 EV_SET(&kev
, fd
, EVFILT_READ
, EV_DELETE
|EV_DISABLE
, 0, 0, NULL
);
1639 kqueue_register(&kq
, &kev
, &n
);
1640 fp
->f_ops
= &badfileops
;
1647 * OpenBSD poll system call.
1648 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
1653 sys_openbsd_poll(struct openbsd_poll_args
*uap
)
1655 return (sys_poll((struct poll_args
*)uap
));
1660 seltrue(cdev_t dev
, int events
)
1662 return (events
& (POLLIN
| POLLOUT
| POLLRDNORM
| POLLWRNORM
));