/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
 * $DragonFly: src/sys/kern/sys_generic.c,v 1.49 2008/05/05 22:09:44 dillon Exp $
 */
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/event.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/kern_syscall.h>
#include <sys/malloc.h>
#include <sys/mapped_ioctl.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/socketops.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/ktrace.h>
#include <vm/vm_page.h>

#include <sys/file2.h>
#include <sys/mplock2.h>
#include <sys/spinlock2.h>

#include <machine/limits.h>
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");
typedef struct kfd_set {
	fd_mask	fds_bits[2];
} kfd_set;

enum select_copyin_states {
	COPYIN_READ, COPYIN_WRITE, COPYIN_EXCEPT, COPYIN_DONE };
struct select_kevent_copyin_args {
	kfd_set		*read_set;
	kfd_set		*write_set;
	kfd_set		*except_set;
	int		active_set;	/* One of select_copyin_states */
	struct lwp	*lwp;		/* Pointer to our lwp */
	int		num_fds;	/* Number of file descriptors (syscall arg) */
	int		proc_fds;	/* Processed fd's (wraps) */
	int		error;		/* Returned to userland */
};
struct poll_kevent_copyin_args {
	struct lwp	*lwp;
	struct pollfd	*fds;
	int		nfds;
	int		pfds;
	int		error;
};
static struct lwkt_token mioctl_token = LWKT_TOKEN_INITIALIZER(mioctl_token);
static int	doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex,
			 struct timespec *ts, int *res);
static int	dopoll(int nfds, struct pollfd *fds, struct timespec *ts,
			int *res);
static int	dofileread(int, struct file *, struct uio *, int, size_t *);
static int	dofilewrite(int, struct file *, struct uio *, int, size_t *);
/*
 * Read system call.
 *
 * MPSAFE
 */
int
sys_read(struct read_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ssize_t)uap->nbyte < 0)
		return (EINVAL);

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult);
	return (error);
}
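
/*
 * For orientation, a hedged userland sketch (illustrative only, not
 * part of this file): the single iovec built above is exactly what a
 * plain read(2) produces, while readv(2) supplies several and enters
 * sys_readv() below.
 *
 *	#include <sys/types.h>
 *	#include <sys/uio.h>
 *	#include <unistd.h>
 *
 *	char hdr[16], body[4096];
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = sizeof(hdr)  },
 *		{ .iov_base = body, .iov_len = sizeof(body) },
 *	};
 *	ssize_t n = readv(0, iov, 2);
 */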
/*
 * Positioned (Pread) read system call
 *
 * MPSAFE
 */
int
sys_extpread(struct extpread_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	if ((ssize_t)uap->nbyte < 0)
		return (EINVAL);

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult);
	return (error);
}
/*
 * Scatter read system call.
 *
 * MPSAFE
 */
int
sys_readv(struct readv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}
/*
 * Scatter positioned read system call.
 *
 * MPSAFE
 */
int
sys_extpreadv(struct extpreadv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}
/*
 * MPSAFE
 */
int
kern_preadv(int fd, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FREAD);
	if (fp == NULL)
		return (EBADF);
	if (flags & O_FOFFSET && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofileread(fd, fp, auio, flags, res);
	}
	fdrop(fp);
	return (error);
}
/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofileread(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
{
	int error;
	size_t len;
#ifdef KTRACE
	struct thread *td = curthread;
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	/*
	 * if tracing, save a copy of iovec
	 */
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_read(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(td->td_lwp, fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return (error);
}
/*
 * Write system call
 *
 * MPSAFE
 */
int
sys_write(struct write_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ssize_t)uap->nbyte < 0)
		return (EINVAL);

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	return (error);
}
/*
 * Positioned (Pwrite) write system call
 *
 * MPSAFE
 */
int
sys_extpwrite(struct extpwrite_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	if ((ssize_t)uap->nbyte < 0)
		return (EINVAL);

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;
	error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult);
	return (error);
}
/*
 * Gather write system call
 *
 * MPSAFE
 */
int
sys_writev(struct writev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}
/*
 * Gather positioned write system call
 *
 * MPSAFE
 */
int
sys_extpwritev(struct extpwritev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}
/*
 * MPSAFE
 */
int
kern_pwritev(int fd, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FWRITE);
	if (fp == NULL)
		return (EBADF);
	else if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofilewrite(fd, fp, auio, flags, res);
	}
	fdrop(fp);
	return (error);
}
/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	int error;
	size_t len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	/*
	 * if tracing, save a copy of iovec and uio
	 */
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_write(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE) {
			lwpsignal(lp->lwp_proc, lp, SIGPIPE);
		}
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(lp, fd, UIO_WRITE, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return (error);
}
/*
 * Ioctl system call
 *
 * MPSAFE
 */
int
sys_ioctl(struct ioctl_args *uap)
{
	int error;

	error = mapped_ioctl(uap->fd, uap->com, uap->data, NULL, &uap->sysmsg);
	return (error);
}
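
/*
 * A hedged userland sketch of the command-word decoding that
 * mapped_ioctl() performs below (illustrative only, not part of this
 * file): FIONBIO is an IOC_IN command whose IOCPARM_LEN() is
 * sizeof(int), so the kernel copyin()s a single int and flips
 * FNONBLOCK on the file.
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/filio.h>
 *	#include <err.h>
 *
 *	int on = 1;
 *	if (ioctl(fd, FIONBIO, &on) < 0)
 *		err(1, "FIONBIO");
 */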
struct ioctl_map_entry {
	const char *subsys;
	struct ioctl_map_range *cmd_ranges;
	LIST_ENTRY(ioctl_map_entry) entries;
};
/*
 * The true heart of all ioctl syscall handlers (native, emulation).
 * If map != NULL, it will be searched for a matching entry for com,
 * and appropriate conversions/conversion functions will be utilized.
 *
 * MPSAFE
 */
int
mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map,
	     struct sysmsg *msg)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct ucred *cred;
	struct file *fp;
	struct ioctl_map_range *iomc = NULL;
	int error;
	u_int size;
	u_long ocom = com;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	KKASSERT(p);
	cred = td->td_ucred;

	fp = holdfp(p->p_fd, fd, FREAD|FWRITE);
	if (fp == NULL)
		return (EBADF);

	if (map != NULL) {	/* obey translation map */
		u_long maskcmd;
		struct ioctl_map_entry *e;

		maskcmd = com & map->mask;

		lwkt_gettoken(&mioctl_token);
		LIST_FOREACH(e, &map->mapping, entries) {
			for (iomc = e->cmd_ranges; iomc->start != 0 ||
			     iomc->maptocmd != 0 || iomc->wrapfunc != NULL ||
			     iomc->mapfunc != NULL;
			     iomc++) {
				if (maskcmd >= iomc->start &&
				    maskcmd <= iomc->end)
					break;
			}

			/* Did we find a match? */
			if (iomc->start != 0 || iomc->maptocmd != 0 ||
			    iomc->wrapfunc != NULL || iomc->mapfunc != NULL)
				break;
		}
		lwkt_reltoken(&mioctl_token);

		if (iomc == NULL ||
		    (iomc->start == 0 && iomc->maptocmd == 0
		     && iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) {
			kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
				map->sys, fd, maskcmd,
				(int)((maskcmd >> 8) & 0xff),
				(int)(maskcmd & 0xff));
			error = EINVAL;
			goto done;
		}

		/*
		 * If it's a non-range one to one mapping, maptocmd should be
		 * correct. If it's a ranged one to one mapping, we pass the
		 * original value of com, and for a range mapped to a different
		 * range, we always need a mapping function to translate the
		 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
		 */
		if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) {
			com = iomc->maptocmd;
		} else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) {
			if (iomc->mapfunc != NULL)
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->start, iomc->end,
						    com, ocom);
		} else {
			if (iomc->mapfunc != NULL) {
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->maptocmd, iomc->maptoend,
						    com, ocom);
			} else {
				kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
					map->sys, fd, maskcmd,
					(int)((maskcmd >> 8) & 0xff),
					(int)(maskcmd & 0xff));
				error = EINVAL;
				goto done;
			}
		}
	}
	switch (com) {
	case FIONCLEX:
		error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	case FIOCLEX:
		error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto done;
	}

	if (size > sizeof (ubuf.stkbuf)) {
		memp = kmalloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		memp = NULL;
		data = ubuf.stkbuf;
	}

	if ((com & IOC_IN) != 0) {
		if (size != 0) {
			error = copyin(uspc_data, data, (size_t)size);
			if (error) {
				if (memp != NULL)
					kfree(memp, M_IOCTLOPS);
				goto done;
			}
		} else {
			*(caddr_t *)data = uspc_data;
		}
	} else if ((com & IOC_OUT) != 0 && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, (size_t)size);
	} else if ((com & IOC_VOID) != 0) {
		*(caddr_t *)data = uspc_data;
	}

	switch (com) {
	case FIONBIO:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		error = 0;
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FASYNC);
		else
			atomic_clear_int(&fp->f_flag, FASYNC);
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred, msg);
		break;

	default:
		/*
		 * If there is an override function,
		 * call it instead of directly routing the call
		 */
		if (map != NULL && iomc->wrapfunc != NULL)
			error = iomc->wrapfunc(fp, com, ocom, data, cred);
		else
			error = fo_ioctl(fp, com, data, cred, msg);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
			error = copyout(data, uspc_data, (size_t)size);
		break;
	}

	if (memp != NULL)
		kfree(memp, M_IOCTLOPS);
done:
	fdrop(fp);
	return (error);
}
/*
 * MPSAFE
 */
int
mapped_ioctl_register_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
		 he->subsys != NULL && *he->subsys != '\0');

	ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP,
		     M_WAITOK | M_ZERO);

	ne->subsys = he->subsys;
	ne->cmd_ranges = he->cmd_ranges;

	lwkt_gettoken(&mioctl_token);
	LIST_INSERT_HEAD(&he->map->mapping, ne, entries);
	lwkt_reltoken(&mioctl_token);

	return (0);
}
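
/*
 * A hedged sketch of a registration (the example_* names below are
 * hypothetical, for illustration only; real tables live in the
 * emulation layers).  An all-zero entry terminates the range scan in
 * mapped_ioctl() above.
 *
 *	static struct ioctl_map_range example_ranges[] = {
 *		{ .start = 0x6500, .end = 0x65ff,
 *		  .maptocmd = 0x9500, .maptoend = 0x95ff,
 *		  .mapfunc = example_mapfunc, .wrapfunc = NULL },
 *		{ 0 }
 *	};
 *	static struct ioctl_map_handler example_handler = {
 *		.map = &example_ioctl_map,
 *		.subsys = "example",
 *		.cmd_ranges = example_ranges
 *	};
 *
 *	mapped_ioctl_register_handler(&example_handler);
 */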
/*
 * MPSAFE
 */
int
mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;
	int error = EINVAL;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);

	lwkt_gettoken(&mioctl_token);
	LIST_FOREACH(ne, &he->map->mapping, entries) {
		if (ne->cmd_ranges == he->cmd_ranges) {
			LIST_REMOVE(ne, entries);
			kfree(ne, M_IOCTLMAP);
			error = 0;
			break;
		}
	}
	lwkt_reltoken(&mioctl_token);
	return (error);
}
static int	nselcoll;	/* Select collisions since boot */
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
static int	nseldebug;
SYSCTL_INT(_kern, OID_AUTO, nseldebug, CTLFLAG_RW, &nseldebug, 0, "");
/*
 * Select system call.
 *
 * MPSAFE
 */
int
sys_select(struct select_args *uap)
{
	struct timeval ktv;
	struct timespec *ktsp, kts;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &ktv, sizeof (ktv));
		if (error)
			return (error);
		TIMEVAL_TO_TIMESPEC(&ktv, &kts);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Do real work.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
			 &uap->sysmsg_result);

	return (error);
}
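
/*
 * Userland view of the conversion above (a hedged sketch, illustrative
 * only; sock is assumed to be an open descriptor): the timeval is
 * copied in and converted to a timespec before doselect() runs, so a
 * 2.5 second timeout arrives as { 2, 500000000 }.
 *
 *	fd_set rset;
 *	struct timeval tv = { 2, 500000 };
 *
 *	FD_ZERO(&rset);
 *	FD_SET(sock, &rset);
 *	int n = select(sock + 1, &rset, NULL, NULL, &tv);
 */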
/*
 * Pselect system call.
 */
int
sys_pselect(struct pselect_args *uap)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct timespec *ktsp, kts;
	sigset_t sigmask;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->ts != NULL) {
		error = copyin(uap->ts, &kts, sizeof (kts));
		if (error)
			return (error);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Install temporary signal mask if any provided.
	 */
	if (uap->sigmask != NULL) {
		error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
		if (error)
			return (error);
		lp->lwp_oldsigmask = lp->lwp_sigmask;
		SIG_CANTMASK(sigmask);
		lp->lwp_sigmask = sigmask;
	}

	/*
	 * Do real work.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
			 &uap->sysmsg_result);

	if (uap->sigmask != NULL) {
		/* doselect() responsible for turning ERESTART into EINTR */
		KKASSERT(error != ERESTART);
		if (error == EINTR) {
			/*
			 * We can't restore the previous signal mask now
			 * because it could block the signal that interrupted
			 * us.  So make a note to restore it after executing
			 * the handler.
			 */
			lp->lwp_flag |= LWP_OLDMASK;
		} else {
			/*
			 * No handler to run. Restore previous mask immediately.
			 */
			lp->lwp_sigmask = lp->lwp_oldsigmask;
		}
	}

	return (error);
}
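
/*
 * The deferred-restore dance above is what makes pselect(2) safe for
 * the classic wakeup race.  A hedged userland sketch (illustrative
 * only; rset, nfds and the got_sigint flag are assumed set up by the
 * caller): SIGINT stays blocked while the flag is tested and is only
 * unblocked, atomically, for the duration of the wait.
 *
 *	sigset_t unblock;
 *
 *	sigemptyset(&unblock);
 *	while (!got_sigint)
 *		pselect(nfds, &rset, NULL, NULL, NULL, &unblock);
 */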
static int
select_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	struct select_kevent_copyin_args *skap = NULL;
	struct kevent *kev;
	int fd;
	kfd_set *fdp = NULL;
	short filter = 0;
	u_int fflags = 0;

	skap = (struct select_kevent_copyin_args *)arg;

	if (*events == maxevents)
		return (0);

	while (skap->active_set < COPYIN_DONE) {
		switch (skap->active_set) {
		case COPYIN_READ:
			/*
			 * Register descriptors for the read filter
			 */
			fdp = skap->read_set;
			filter = EVFILT_READ;
			fflags = NOTE_OLDAPI;
			break;
		case COPYIN_WRITE:
			/*
			 * Register descriptors for the write filter
			 */
			fdp = skap->write_set;
			filter = EVFILT_WRITE;
			fflags = NOTE_OLDAPI;
			break;
		case COPYIN_EXCEPT:
			/*
			 * Register descriptors for the exception filter
			 */
			fdp = skap->except_set;
			filter = EVFILT_EXCEPT;
			fflags = NOTE_OLDAPI | NOTE_OOB;
			break;
		case COPYIN_DONE:
			/*
			 * Nothing left to register
			 */
			return (0);
			/* NOT REACHED */
		}

		if (fdp == NULL) {
			++skap->active_set;
			skap->proc_fds = 0;
			continue;
		}

		while (skap->proc_fds < skap->num_fds) {
			fd = skap->proc_fds;
			if (FD_ISSET(fd, fdp)) {
				kev = &kevp[*events];
				EV_SET(kev, fd, filter,
				       EV_ADD|EV_ENABLE,
				       fflags, 0, (void *)(uintptr_t)
				       skap->lwp->lwp_kqueue_serial);
				FD_CLR(fd, fdp);
				++*events;

				if (nseldebug)
					kprintf("select fd %d filter %d serial %d\n",
						fd, filter,
						skap->lwp->lwp_kqueue_serial);
			}
			++skap->proc_fds;
			if (*events == maxevents)
				return (0);
		}
		++skap->active_set;
		skap->proc_fds = 0;
	}

	return (0);
}
static int
select_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct select_kevent_copyin_args *skap;
	struct kevent kev;
	int i;

	skap = (struct select_kevent_copyin_args *)arg;

	for (i = 0; i < count; ++i) {
		/*
		 * Filter out and delete spurious events
		 */
		if ((u_int)(uintptr_t)kevp[i].udata !=
		    skap->lwp->lwp_kqueue_serial) {
			kev = kevp[i];
			kev.flags = EV_DISABLE|EV_DELETE;
			kqueue_register(&skap->lwp->lwp_kqueue, &kev);
			if (nseldebug)
				kprintf("select fd %ju mismatched serial %d\n",
					(uintmax_t)kevp[i].ident,
					skap->lwp->lwp_kqueue_serial);
			continue;
		}

		/*
		 * Handle errors
		 */
		if (kevp[i].flags & EV_ERROR) {
			switch(kevp[i].data) {
			case EBADF:
				/*
				 * A bad file descriptor is considered a
				 * fatal error for select, bail out.
				 */
				skap->error = EBADF;
				*res = 0;
				return (1);

			default:
				/*
				 * Select silently swallows any unknown errors
				 * for descriptors in the read or write sets.
				 *
				 * ALWAYS filter out EOPNOTSUPP errors from
				 * filters (at least until all filters support
				 * EVFILT_EXCEPT).
				 */
				if (kevp[i].filter != EVFILT_READ &&
				    kevp[i].filter != EVFILT_WRITE &&
				    kevp[i].data != EOPNOTSUPP) {
					skap->error = kevp[i].data;
					*res = 0;
					return (1);
				}
				break;
			}
			if (nseldebug)
				kprintf("select fd %ju filter %d error %jd\n",
					(uintmax_t)kevp[i].ident,
					kevp[i].filter,
					(intmax_t)kevp[i].data);
			continue;
		}

		switch (kevp[i].filter) {
		case EVFILT_READ:
			FD_SET(kevp[i].ident, skap->read_set);
			break;
		case EVFILT_WRITE:
			FD_SET(kevp[i].ident, skap->write_set);
			break;
		case EVFILT_EXCEPT:
			FD_SET(kevp[i].ident, skap->except_set);
			break;
		}

		++*res;
	}

	return (0);
}
/*
 * Copy select bits in from userland.  Allocate kernel memory if the
 * set is large.
 */
static int
getbits(int bytes, fd_set *in_set, kfd_set **out_set, kfd_set *tmp_set)
{
	int error;

	if (in_set) {
		if (bytes < sizeof(*tmp_set))
			*out_set = tmp_set;
		else
			*out_set = kmalloc(bytes, M_SELECT, M_WAITOK);
		error = copyin(in_set, *out_set, bytes);
	} else {
		*out_set = NULL;
		error = 0;
	}
	return (error);
}
/*
 * Copy returned select bits back out to userland.
 */
static int
putbits(int bytes, kfd_set *in_set, fd_set *out_set)
{
	int error;

	if (in_set) {
		error = copyout(in_set, out_set, bytes);
	} else {
		error = 0;
	}
	return (error);
}
/*
 * Common code for sys_select() and sys_pselect().
 *
 * in, out and ex are userland pointers.  ts must point to validated
 * kernel-side timeout value or NULL for infinite timeout.  res must
 * point to syscall return value.
 */
static int
doselect(int nd, fd_set *read, fd_set *write, fd_set *except,
	 struct timespec *ts, int *res)
{
	struct proc *p = curproc;
	struct select_kevent_copyin_args *kap, ka;
	int bytes, error;
	kfd_set read_tmp;
	kfd_set write_tmp;
	kfd_set except_tmp;

	*res = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles)		/* limit kmalloc */
		nd = p->p_fd->fd_nfiles;

	kap = &ka;
	kap->lwp = curthread->td_lwp;
	kap->num_fds = nd;
	kap->proc_fds = 0;
	kap->error = 0;
	kap->active_set = COPYIN_READ;

	/*
	 * Calculate bytes based on the number of __fd_mask[] array entries
	 * multiplied by the size of __fd_mask.
	 */
	bytes = howmany(nd, __NFDBITS) * sizeof(__fd_mask);

	/* kap->read_set = NULL; not needed */
	kap->write_set = NULL;
	kap->except_set = NULL;

	error = getbits(bytes, read, &kap->read_set, &read_tmp);
	if (error == 0)
		error = getbits(bytes, write, &kap->write_set, &write_tmp);
	if (error == 0)
		error = getbits(bytes, except, &kap->except_set, &except_tmp);
	if (error)
		goto done;

	/*
	 * NOTE: Make sure the max events passed to kern_kevent() is
	 *	 effectively unlimited.  (nd * 3) accomplishes this.
	 *
	 *	 (*res) continues to increment as returned events are
	 *	 loaded in.
	 */
	error = kern_kevent(&kap->lwp->lwp_kqueue, 0x7FFFFFFF, res, kap,
			    select_copyin, select_copyout, ts);
	if (error == 0)
		error = putbits(bytes, kap->read_set, read);
	if (error == 0)
		error = putbits(bytes, kap->write_set, write);
	if (error == 0)
		error = putbits(bytes, kap->except_set, except);

	/*
	 * An error from an individual event that should be passed
	 * back to userland (EBADF)
	 */
	if (kap->error)
		error = kap->error;

	/*
	 * Clean up.
	 */
done:
	if (kap->read_set && kap->read_set != &read_tmp)
		kfree(kap->read_set, M_SELECT);
	if (kap->write_set && kap->write_set != &write_tmp)
		kfree(kap->write_set, M_SELECT);
	if (kap->except_set && kap->except_set != &except_tmp)
		kfree(kap->except_set, M_SELECT);

	kap->lwp->lwp_kqueue_serial += kap->num_fds;

	return (error);
}
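
/*
 * Worked example of the sizing arithmetic above, assuming a 32-bit
 * __fd_mask (__NFDBITS == 32): nd = 100 gives
 * howmany(100, 32) * sizeof(__fd_mask) = 4 * 4 = 16 bytes, so only
 * the first 16 bytes of each userland fd_set are copied in and back
 * out, regardless of how large the caller's fd_set really is.
 */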
/*
 * Poll system call.
 *
 * MPSAFE
 */
int
sys_poll(struct poll_args *uap)
{
	struct timespec ts, *tsp;
	int error;

	if (uap->timeout != INFTIM) {
		ts.tv_sec = uap->timeout / 1000;
		ts.tv_nsec = (uap->timeout % 1000) * 1000 * 1000;
		tsp = &ts;
	} else {
		tsp = NULL;
	}

	error = dopoll(uap->nfds, uap->fds, tsp, &uap->sysmsg_result);

	return (error);
}
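
/*
 * Worked example of the conversion above: poll(fds, nfds, 2500)
 * yields ts.tv_sec = 2500 / 1000 = 2 and
 * ts.tv_nsec = (2500 % 1000) * 1000 * 1000 = 500000000, i.e. a 2.5
 * second timespec, while INFTIM (-1) selects tsp == NULL and thus an
 * infinite wait in kern_kevent().
 */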
static int
poll_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	struct poll_kevent_copyin_args *pkap;
	struct pollfd *pfd;
	struct kevent *kev;
	int kev_count;

	pkap = (struct poll_kevent_copyin_args *)arg;

	while (pkap->pfds < pkap->nfds) {
		pfd = &pkap->fds[pkap->pfds];

		/* Clear return events */
		pfd->revents = 0;

		/* Do not check if fd is equal to -1 */
		if (pfd->fd == -1) {
			++pkap->pfds;
			continue;
		}

		kev_count = 0;
		if (pfd->events & (POLLIN | POLLRDNORM))
			kev_count++;
		if (pfd->events & (POLLOUT | POLLWRNORM))
			kev_count++;
		if (pfd->events & (POLLPRI | POLLRDBAND))
			kev_count++;

		if (*events + kev_count > maxevents)
			return (0);

		/*
		 * NOTE: A combined serial number and poll array index is
		 *	 stored in kev->udata.
		 */
		kev = &kevp[*events];
		if (pfd->events & (POLLIN | POLLRDNORM)) {
			EV_SET(kev++, pfd->fd, EVFILT_READ, EV_ADD|EV_ENABLE,
			       NOTE_OLDAPI, 0, (void *)(uintptr_t)
				(pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}
		if (pfd->events & (POLLOUT | POLLWRNORM)) {
			EV_SET(kev++, pfd->fd, EVFILT_WRITE, EV_ADD|EV_ENABLE,
			       NOTE_OLDAPI, 0, (void *)(uintptr_t)
				(pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}
		if (pfd->events & (POLLPRI | POLLRDBAND)) {
			EV_SET(kev++, pfd->fd, EVFILT_EXCEPT, EV_ADD|EV_ENABLE,
			       NOTE_OLDAPI | NOTE_OOB, 0,
			       (void *)(uintptr_t)
				(pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}

		if (nseldebug) {
			kprintf("poll index %d/%d fd %d events %08x serial %d\n",
				pkap->pfds, pkap->nfds-1, pfd->fd, pfd->events,
				pkap->lwp->lwp_kqueue_serial);
		}

		++pkap->pfds;
		(*events) += kev_count;
	}

	return (0);
}
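
/*
 * Worked example of the udata encoding above, assuming
 * lwp_kqueue_serial == 1000 on entry: the kevents registered for
 * fds[0], fds[1] and fds[2] carry udata 1000, 1001 and 1002.
 * poll_copyout() recovers the array index with
 *
 *	pi = (u_int)(uintptr_t)kevp[i].udata -
 *	     (u_int)pkap->lwp->lwp_kqueue_serial;
 *
 * and a stale event left over from an earlier poll (smaller serial)
 * underflows to a huge pi >= nfds, so it is deleted as spurious.
 * dopoll() finally bumps the serial by nfds so the next call cannot
 * collide.
 */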
static int
poll_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct poll_kevent_copyin_args *pkap;
	struct pollfd *pfd;
	struct kevent kev;
	int count_res;
	int i;
	u_int pi;

	pkap = (struct poll_kevent_copyin_args *)arg;

	for (i = 0; i < count; ++i) {
		/*
		 * Extract the poll array index and delete spurious events.
		 * We can easily tell if the serial number is incorrect
		 * by checking whether the extracted index is out of range.
		 */
		pi = (u_int)(uintptr_t)kevp[i].udata -
		     (u_int)pkap->lwp->lwp_kqueue_serial;

		if (pi >= pkap->nfds) {
			kev = kevp[i];
			kev.flags = EV_DISABLE|EV_DELETE;
			kqueue_register(&pkap->lwp->lwp_kqueue, &kev);
			if (nseldebug)
				kprintf("poll index %d out of range against serial %d\n",
					pi, pkap->lwp->lwp_kqueue_serial);
			continue;
		}
		pfd = &pkap->fds[pi];
		if (kevp[i].ident == pfd->fd) {
			/*
			 * A single descriptor may generate an error against
			 * more than one filter, make sure to set the
			 * appropriate flags but do not increment (*res)
			 * more than once.
			 */
			count_res = (pfd->revents == 0);
			if (kevp[i].flags & EV_ERROR) {
				switch(kevp[i].data) {
				case EBADF:
					/* Bad file descriptor */
					if (count_res)
						++*res;
					pfd->revents |= POLLNVAL;
					break;
				default:
					/*
					 * Poll silently swallows any unknown
					 * errors except in the case of POLLPRI
					 * (OOB/urgent data).
					 *
					 * ALWAYS filter out EOPNOTSUPP errors
					 * from filters, common applications
					 * set POLLPRI|POLLRDBAND and most
					 * filters do not support EVFILT_EXCEPT.
					 */
					if (kevp[i].filter != EVFILT_READ &&
					    kevp[i].filter != EVFILT_WRITE &&
					    kevp[i].data != EOPNOTSUPP) {
						if (count_res)
							++*res;
						pfd->revents |= POLLERR;
					}
					break;
				}
				if (nseldebug) {
					kprintf("poll index %d fd %d "
						"filter %d error %jd\n",
						pi, pfd->fd,
						kevp[i].filter,
						(intmax_t)kevp[i].data);
				}
				continue;
			}

			switch (kevp[i].filter) {
			case EVFILT_READ:
				/*
				 * EOF on the read side can indicate a
				 * half-closed situation and not necessarily
				 * a disconnect, so depend on the user
				 * issuing a read() and getting 0 bytes back.
				 */
				if (kevp[i].flags & EV_EOF)
					pfd->revents |= POLLHUP;
				if (pfd->events & POLLIN)
					pfd->revents |= POLLIN;
				if (pfd->events & POLLRDNORM)
					pfd->revents |= POLLRDNORM;
				break;
			case EVFILT_WRITE:
				/*
				 * As per the OpenGroup POLLHUP is mutually
				 * exclusive with the writability flags.  I
				 * consider this a bit broken but...
				 *
				 * In this case a disconnect is implied even
				 * for a half-closed (write side) situation.
				 */
				if (kevp[i].flags & EV_EOF) {
					pfd->revents |= POLLHUP;
				} else {
					if (pfd->events & POLLOUT)
						pfd->revents |= POLLOUT;
					if (pfd->events & POLLWRNORM)
						pfd->revents |= POLLWRNORM;
				}
				break;
			case EVFILT_EXCEPT:
				/*
				 * EV_EOF should never be tagged for this
				 * filter.
				 */
				if (pfd->events & POLLPRI)
					pfd->revents |= POLLPRI;
				if (pfd->events & POLLRDBAND)
					pfd->revents |= POLLRDBAND;
				break;
			}

			if (nseldebug) {
				kprintf("poll index %d/%d fd %d revents %08x\n",
					pi, pkap->nfds, pfd->fd, pfd->revents);
			}

			if (count_res && pfd->revents)
				++*res;
		} else {
			if (nseldebug) {
				kprintf("poll index %d mismatch %ju/%d\n",
					pi, (uintmax_t)kevp[i].ident, pfd->fd);
			}
		}
	}

	return (0);
}
static int
dopoll(int nfds, struct pollfd *fds, struct timespec *ts, int *res)
{
	struct poll_kevent_copyin_args ka;
	struct pollfd sfds[64];
	int bytes;
	int error;

	*res = 0;
	if (nfds < 0)
		return (EINVAL);

	/*
	 * This is a bit arbitrary but we need to limit internal kmallocs.
	 */
	if (nfds > maxfilesperproc * 2)
		nfds = maxfilesperproc * 2;
	bytes = sizeof(struct pollfd) * nfds;

	ka.lwp = curthread->td_lwp;
	ka.nfds = nfds;
	ka.pfds = 0;
	ka.error = 0;

	if (ka.nfds < 64)
		ka.fds = sfds;
	else
		ka.fds = kmalloc(bytes, M_SELECT, M_WAITOK);

	error = copyin(fds, ka.fds, bytes);
	if (error == 0)
		error = kern_kevent(&ka.lwp->lwp_kqueue, 0x7FFFFFFF, res, &ka,
				    poll_copyin, poll_copyout, ts);

	if (error == 0)
		error = copyout(ka.fds, fds, bytes);

	if (ka.fds != sfds)
		kfree(ka.fds, M_SELECT);

	ka.lwp->lwp_kqueue_serial += nfds;

	return (error);
}
static int
socket_wait_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	return (0);
}

static int
socket_wait_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	++*res;
	return (0);
}

extern	struct fileops socketops;

/*
 * NOTE: Callers of socket_wait() must already have a reference on the
 *	 socket.
 */
int
socket_wait(struct socket *so, struct timespec *ts, int *res)
{
	struct thread *td = curthread;
	struct file *fp;
	struct kqueue kq;
	struct kevent kev;
	int error, fd;

	if ((error = falloc(td->td_lwp, &fp, &fd)) != 0)
		return (error);

	fp->f_type = DTYPE_SOCKET;
	fp->f_flag = FREAD | FWRITE;
	fp->f_ops = &socketops;
	fp->f_data = so;
	fsetfd(td->td_lwp->lwp_proc->p_fd, fp, fd);

	kqueue_init(&kq, td->td_lwp->lwp_proc->p_fd);
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, NULL);
	if ((error = kqueue_register(&kq, &kev)) != 0) {
		fdrop(fp);
		return (error);
	}

	error = kern_kevent(&kq, 1, res, NULL, socket_wait_copyin,
			    socket_wait_copyout, ts);

	EV_SET(&kev, fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
	kqueue_register(&kq, &kev);
	fp->f_ops = &badfileops;
	fdrop(fp);

	return (error);
}
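
/*
 * A hedged sketch of a kernel-side consumer (illustrative only;
 * actual callers and timeouts vary): wait up to one second for data
 * to arrive on an already-referenced socket so.
 *
 *	struct timespec ts = { 1, 0 };
 *	int res = 0;
 *
 *	error = socket_wait(so, &ts, &res);
 *	if (error == 0 && res > 0)
 *		the socket is readable
 */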
/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
int
sys_openbsd_poll(struct openbsd_poll_args *uap)
{
	return (sys_poll((struct poll_args *)uap));
}

int
seltrue(cdev_t dev, int events)
{
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}