2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
35 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
38 #include "opt_ktrace.h"
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/sysproto.h>
43 #include <sys/event.h>
44 #include <sys/filedesc.h>
45 #include <sys/filio.h>
46 #include <sys/fcntl.h>
49 #include <sys/signalvar.h>
50 #include <sys/socketvar.h>
52 #include <sys/kernel.h>
53 #include <sys/kern_syscall.h>
54 #include <sys/malloc.h>
55 #include <sys/mapped_ioctl.h>
57 #include <sys/queue.h>
58 #include <sys/resourcevar.h>
59 #include <sys/socketops.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
64 #include <sys/ktrace.h>
67 #include <vm/vm_page.h>
69 #include <sys/file2.h>
70 #include <sys/spinlock2.h>
72 #include <machine/limits.h>
74 static MALLOC_DEFINE(M_IOCTLOPS
, "ioctlops", "ioctl data buffer");
75 static MALLOC_DEFINE(M_IOCTLMAP
, "ioctlmap", "mapped ioctl handler buffer");
76 static MALLOC_DEFINE(M_SELECT
, "select", "select() buffer");
77 MALLOC_DEFINE(M_IOV
, "iov", "large iov's");
79 typedef struct kfd_set
{
/*
 * State machine for select_copyin(): which descriptor set (read,
 * write, except) is currently being registered with the kqueue.
 */
enum select_copyin_states {
	COPYIN_READ,
	COPYIN_WRITE,
	COPYIN_EXCEPT,
	COPYIN_DONE
};
86 struct select_kevent_copyin_args
{
90 int active_set
; /* One of select_copyin_states */
91 struct lwp
*lwp
; /* Pointer to our lwp */
92 int num_fds
; /* Number of file descriptors (syscall arg) */
93 int proc_fds
; /* Processed fd's (wraps) */
94 int error
; /* Returned to userland */
/*
 * Cookie passed through kern_kevent() for dopoll().
 * NOTE(review): the member list was lost in extraction; fields are
 * re-inferred from their use in dopoll()/poll_copyin()/poll_copyout().
 */
struct poll_kevent_copyin_args {
	struct lwp	*lwp;		/* Pointer to our lwp */
	struct pollfd	*fds;		/* Copied-in pollfd array */
	int		nfds;		/* Number of pollfd entries */
	int		pfds;		/* Entries processed so far */
	int		error;		/* Returned to userland */
};
105 static struct lwkt_token mioctl_token
= LWKT_TOKEN_INITIALIZER(mioctl_token
);
107 static int doselect(int nd
, fd_set
*in
, fd_set
*ou
, fd_set
*ex
,
108 struct timespec
*ts
, int *res
);
109 static int dopoll(int nfds
, struct pollfd
*fds
, struct timespec
*ts
,
110 int *res
, int flags
);
111 static int dofileread(int, struct file
*, struct uio
*, int, size_t *);
112 static int dofilewrite(int, struct file
*, struct uio
*, int, size_t *);
120 sys_read(struct read_args
*uap
)
122 struct thread
*td
= curthread
;
127 if ((ssize_t
)uap
->nbyte
< 0)
130 aiov
.iov_base
= uap
->buf
;
131 aiov
.iov_len
= uap
->nbyte
;
132 auio
.uio_iov
= &aiov
;
134 auio
.uio_offset
= -1;
135 auio
.uio_resid
= uap
->nbyte
;
136 auio
.uio_rw
= UIO_READ
;
137 auio
.uio_segflg
= UIO_USERSPACE
;
140 error
= kern_preadv(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
145 * Positioned (Pread) read system call
150 sys_extpread(struct extpread_args
*uap
)
152 struct thread
*td
= curthread
;
158 if ((ssize_t
)uap
->nbyte
< 0)
161 aiov
.iov_base
= uap
->buf
;
162 aiov
.iov_len
= uap
->nbyte
;
163 auio
.uio_iov
= &aiov
;
165 auio
.uio_offset
= uap
->offset
;
166 auio
.uio_resid
= uap
->nbyte
;
167 auio
.uio_rw
= UIO_READ
;
168 auio
.uio_segflg
= UIO_USERSPACE
;
171 flags
= uap
->flags
& O_FMASK
;
172 if (uap
->offset
!= (off_t
)-1)
175 error
= kern_preadv(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
180 * Scatter read system call.
185 sys_readv(struct readv_args
*uap
)
187 struct thread
*td
= curthread
;
189 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
192 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
197 auio
.uio_iovcnt
= uap
->iovcnt
;
198 auio
.uio_offset
= -1;
199 auio
.uio_rw
= UIO_READ
;
200 auio
.uio_segflg
= UIO_USERSPACE
;
203 error
= kern_preadv(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
205 iovec_free(&iov
, aiov
);
211 * Scatter positioned read system call.
216 sys_extpreadv(struct extpreadv_args
*uap
)
218 struct thread
*td
= curthread
;
220 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
224 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
229 auio
.uio_iovcnt
= uap
->iovcnt
;
230 auio
.uio_offset
= uap
->offset
;
231 auio
.uio_rw
= UIO_READ
;
232 auio
.uio_segflg
= UIO_USERSPACE
;
235 flags
= uap
->flags
& O_FMASK
;
236 if (uap
->offset
!= (off_t
)-1)
239 error
= kern_preadv(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
241 iovec_free(&iov
, aiov
);
249 kern_preadv(int fd
, struct uio
*auio
, int flags
, size_t *res
)
251 struct thread
*td
= curthread
;
252 struct proc
*p
= td
->td_proc
;
258 fp
= holdfp(p
->p_fd
, fd
, FREAD
);
261 if (flags
& O_FOFFSET
&& fp
->f_type
!= DTYPE_VNODE
) {
264 error
= dofileread(fd
, fp
, auio
, flags
, res
);
271 * Common code for readv and preadv that reads data in
272 * from a file using the passed in uio, offset, and flags.
274 * MPALMOSTSAFE - ktrace needs help
277 dofileread(int fd
, struct file
*fp
, struct uio
*auio
, int flags
, size_t *res
)
282 struct thread
*td
= curthread
;
283 struct iovec
*ktriov
= NULL
;
289 * if tracing, save a copy of iovec
291 if (KTRPOINT(td
, KTR_GENIO
)) {
292 int iovlen
= auio
->uio_iovcnt
* sizeof(struct iovec
);
294 ktriov
= kmalloc(iovlen
, M_TEMP
, M_WAITOK
);
295 bcopy((caddr_t
)auio
->uio_iov
, (caddr_t
)ktriov
, iovlen
);
299 len
= auio
->uio_resid
;
300 error
= fo_read(fp
, auio
, fp
->f_cred
, flags
);
302 if (auio
->uio_resid
!= len
&& (error
== ERESTART
||
303 error
== EINTR
|| error
== EWOULDBLOCK
))
307 if (ktriov
!= NULL
) {
309 ktruio
.uio_iov
= ktriov
;
310 ktruio
.uio_resid
= len
- auio
->uio_resid
;
311 ktrgenio(td
->td_lwp
, fd
, UIO_READ
, &ktruio
, error
);
313 kfree(ktriov
, M_TEMP
);
317 *res
= len
- auio
->uio_resid
;
328 sys_write(struct write_args
*uap
)
330 struct thread
*td
= curthread
;
335 if ((ssize_t
)uap
->nbyte
< 0)
338 aiov
.iov_base
= (void *)(uintptr_t)uap
->buf
;
339 aiov
.iov_len
= uap
->nbyte
;
340 auio
.uio_iov
= &aiov
;
342 auio
.uio_offset
= -1;
343 auio
.uio_resid
= uap
->nbyte
;
344 auio
.uio_rw
= UIO_WRITE
;
345 auio
.uio_segflg
= UIO_USERSPACE
;
348 error
= kern_pwritev(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
359 sys_extpwrite(struct extpwrite_args
*uap
)
361 struct thread
*td
= curthread
;
367 if ((ssize_t
)uap
->nbyte
< 0)
370 aiov
.iov_base
= (void *)(uintptr_t)uap
->buf
;
371 aiov
.iov_len
= uap
->nbyte
;
372 auio
.uio_iov
= &aiov
;
374 auio
.uio_offset
= uap
->offset
;
375 auio
.uio_resid
= uap
->nbyte
;
376 auio
.uio_rw
= UIO_WRITE
;
377 auio
.uio_segflg
= UIO_USERSPACE
;
380 flags
= uap
->flags
& O_FMASK
;
381 if (uap
->offset
!= (off_t
)-1)
383 error
= kern_pwritev(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
391 sys_writev(struct writev_args
*uap
)
393 struct thread
*td
= curthread
;
395 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
398 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
403 auio
.uio_iovcnt
= uap
->iovcnt
;
404 auio
.uio_offset
= -1;
405 auio
.uio_rw
= UIO_WRITE
;
406 auio
.uio_segflg
= UIO_USERSPACE
;
409 error
= kern_pwritev(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
411 iovec_free(&iov
, aiov
);
417 * Gather positioned write system call
422 sys_extpwritev(struct extpwritev_args
*uap
)
424 struct thread
*td
= curthread
;
426 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
430 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
435 auio
.uio_iovcnt
= uap
->iovcnt
;
436 auio
.uio_offset
= uap
->offset
;
437 auio
.uio_rw
= UIO_WRITE
;
438 auio
.uio_segflg
= UIO_USERSPACE
;
441 flags
= uap
->flags
& O_FMASK
;
442 if (uap
->offset
!= (off_t
)-1)
445 error
= kern_pwritev(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
447 iovec_free(&iov
, aiov
);
455 kern_pwritev(int fd
, struct uio
*auio
, int flags
, size_t *res
)
457 struct thread
*td
= curthread
;
458 struct proc
*p
= td
->td_proc
;
464 fp
= holdfp(p
->p_fd
, fd
, FWRITE
);
467 else if ((flags
& O_FOFFSET
) && fp
->f_type
!= DTYPE_VNODE
) {
470 error
= dofilewrite(fd
, fp
, auio
, flags
, res
);
478 * Common code for writev and pwritev that writes data to
479 * a file using the passed in uio, offset, and flags.
481 * MPALMOSTSAFE - ktrace needs help
484 dofilewrite(int fd
, struct file
*fp
, struct uio
*auio
, int flags
, size_t *res
)
486 struct thread
*td
= curthread
;
487 struct lwp
*lp
= td
->td_lwp
;
491 struct iovec
*ktriov
= NULL
;
497 * if tracing, save a copy of iovec and uio
499 if (KTRPOINT(td
, KTR_GENIO
)) {
500 int iovlen
= auio
->uio_iovcnt
* sizeof(struct iovec
);
502 ktriov
= kmalloc(iovlen
, M_TEMP
, M_WAITOK
);
503 bcopy((caddr_t
)auio
->uio_iov
, (caddr_t
)ktriov
, iovlen
);
507 len
= auio
->uio_resid
;
508 error
= fo_write(fp
, auio
, fp
->f_cred
, flags
);
510 if (auio
->uio_resid
!= len
&& (error
== ERESTART
||
511 error
== EINTR
|| error
== EWOULDBLOCK
))
513 /* Socket layer is responsible for issuing SIGPIPE. */
514 if (error
== EPIPE
&& fp
->f_type
!= DTYPE_SOCKET
)
515 lwpsignal(lp
->lwp_proc
, lp
, SIGPIPE
);
518 if (ktriov
!= NULL
) {
520 ktruio
.uio_iov
= ktriov
;
521 ktruio
.uio_resid
= len
- auio
->uio_resid
;
522 ktrgenio(lp
, fd
, UIO_WRITE
, &ktruio
, error
);
524 kfree(ktriov
, M_TEMP
);
528 *res
= len
- auio
->uio_resid
;
539 sys_ioctl(struct ioctl_args
*uap
)
543 error
= mapped_ioctl(uap
->fd
, uap
->com
, uap
->data
, NULL
, &uap
->sysmsg
);
547 struct ioctl_map_entry
{
549 struct ioctl_map_range
*cmd_ranges
;
550 LIST_ENTRY(ioctl_map_entry
) entries
;
554 * The true heart of all ioctl syscall handlers (native, emulation).
555 * If map != NULL, it will be searched for a matching entry for com,
556 * and appropriate conversions/conversion functions will be utilized.
561 mapped_ioctl(int fd
, u_long com
, caddr_t uspc_data
, struct ioctl_map
*map
,
564 struct thread
*td
= curthread
;
565 struct proc
*p
= td
->td_proc
;
568 struct ioctl_map_range
*iomc
= NULL
;
574 #define STK_PARAMS 128
576 char stkbuf
[STK_PARAMS
];
584 fp
= holdfp(p
->p_fd
, fd
, FREAD
|FWRITE
);
588 if (map
!= NULL
) { /* obey translation map */
590 struct ioctl_map_entry
*e
;
592 maskcmd
= com
& map
->mask
;
594 lwkt_gettoken(&mioctl_token
);
595 LIST_FOREACH(e
, &map
->mapping
, entries
) {
596 for (iomc
= e
->cmd_ranges
; iomc
->start
!= 0 ||
597 iomc
->maptocmd
!= 0 || iomc
->wrapfunc
!= NULL
||
598 iomc
->mapfunc
!= NULL
;
600 if (maskcmd
>= iomc
->start
&&
601 maskcmd
<= iomc
->end
)
605 /* Did we find a match? */
606 if (iomc
->start
!= 0 || iomc
->maptocmd
!= 0 ||
607 iomc
->wrapfunc
!= NULL
|| iomc
->mapfunc
!= NULL
)
610 lwkt_reltoken(&mioctl_token
);
613 (iomc
->start
== 0 && iomc
->maptocmd
== 0
614 && iomc
->wrapfunc
== NULL
&& iomc
->mapfunc
== NULL
)) {
615 kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
616 map
->sys
, fd
, maskcmd
,
617 (int)((maskcmd
>> 8) & 0xff),
618 (int)(maskcmd
& 0xff));
624 * If it's a non-range one to one mapping, maptocmd should be
625 * correct. If it's a ranged one to one mapping, we pass the
626 * original value of com, and for a range mapped to a different
627 * range, we always need a mapping function to translate the
628 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
630 if (iomc
->start
== iomc
->end
&& iomc
->maptocmd
== iomc
->maptoend
) {
631 com
= iomc
->maptocmd
;
632 } else if (iomc
->start
== iomc
->maptocmd
&& iomc
->end
== iomc
->maptoend
) {
633 if (iomc
->mapfunc
!= NULL
)
634 com
= iomc
->mapfunc(iomc
->start
, iomc
->end
,
635 iomc
->start
, iomc
->end
,
638 if (iomc
->mapfunc
!= NULL
) {
639 com
= iomc
->mapfunc(iomc
->start
, iomc
->end
,
640 iomc
->maptocmd
, iomc
->maptoend
,
643 kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
644 map
->sys
, fd
, maskcmd
,
645 (int)((maskcmd
>> 8) & 0xff),
646 (int)(maskcmd
& 0xff));
655 error
= fclrfdflags(p
->p_fd
, fd
, UF_EXCLOSE
);
658 error
= fsetfdflags(p
->p_fd
, fd
, UF_EXCLOSE
);
663 * Interpret high order word to find amount of data to be
664 * copied to/from the user's address space.
666 size
= IOCPARM_LEN(com
);
667 if (size
> IOCPARM_MAX
) {
672 if ((com
& IOC_VOID
) == 0 && size
> sizeof(ubuf
.stkbuf
)) {
673 memp
= kmalloc(size
, M_IOCTLOPS
, M_WAITOK
);
679 if (com
& IOC_VOID
) {
680 *(caddr_t
*)data
= uspc_data
;
681 } else if (com
& IOC_IN
) {
683 error
= copyin(uspc_data
, data
, (size_t)size
);
687 *(caddr_t
*)data
= uspc_data
;
689 } else if ((com
& IOC_OUT
) != 0 && size
) {
691 * Zero the buffer so the user always
692 * gets back something deterministic.
694 bzero(data
, (size_t)size
);
699 if ((tmp
= *(int *)data
))
700 atomic_set_int(&fp
->f_flag
, FNONBLOCK
);
702 atomic_clear_int(&fp
->f_flag
, FNONBLOCK
);
707 if ((tmp
= *(int *)data
))
708 atomic_set_int(&fp
->f_flag
, FASYNC
);
710 atomic_clear_int(&fp
->f_flag
, FASYNC
);
711 error
= fo_ioctl(fp
, FIOASYNC
, (caddr_t
)&tmp
, cred
, msg
);
716 * If there is a override function,
717 * call it instead of directly routing the call
719 if (map
!= NULL
&& iomc
->wrapfunc
!= NULL
)
720 error
= iomc
->wrapfunc(fp
, com
, ocom
, data
, cred
);
722 error
= fo_ioctl(fp
, com
, data
, cred
, msg
);
724 * Copy any data to user, size was
725 * already set and checked above.
727 if (error
== 0 && (com
& IOC_OUT
) != 0 && size
!= 0)
728 error
= copyout(data
, uspc_data
, (size_t)size
);
733 kfree(memp
, M_IOCTLOPS
);
742 mapped_ioctl_register_handler(struct ioctl_map_handler
*he
)
744 struct ioctl_map_entry
*ne
;
746 KKASSERT(he
!= NULL
&& he
->map
!= NULL
&& he
->cmd_ranges
!= NULL
&&
747 he
->subsys
!= NULL
&& *he
->subsys
!= '\0');
749 ne
= kmalloc(sizeof(struct ioctl_map_entry
), M_IOCTLMAP
,
752 ne
->subsys
= he
->subsys
;
753 ne
->cmd_ranges
= he
->cmd_ranges
;
755 lwkt_gettoken(&mioctl_token
);
756 LIST_INSERT_HEAD(&he
->map
->mapping
, ne
, entries
);
757 lwkt_reltoken(&mioctl_token
);
766 mapped_ioctl_unregister_handler(struct ioctl_map_handler
*he
)
768 struct ioctl_map_entry
*ne
;
771 KKASSERT(he
!= NULL
&& he
->map
!= NULL
&& he
->cmd_ranges
!= NULL
);
773 lwkt_gettoken(&mioctl_token
);
774 LIST_FOREACH(ne
, &he
->map
->mapping
, entries
) {
775 if (ne
->cmd_ranges
== he
->cmd_ranges
) {
776 LIST_REMOVE(ne
, entries
);
777 kfree(ne
, M_IOCTLMAP
);
782 lwkt_reltoken(&mioctl_token
);
786 static int nselcoll
; /* Select collisions since boot */
788 SYSCTL_INT(_kern
, OID_AUTO
, nselcoll
, CTLFLAG_RD
, &nselcoll
, 0, "");
789 static int nseldebug
;
790 SYSCTL_INT(_kern
, OID_AUTO
, nseldebug
, CTLFLAG_RW
, &nseldebug
, 0, "");
793 * Select system call.
798 sys_select(struct select_args
*uap
)
801 struct timespec
*ktsp
, kts
;
805 * Get timeout if any.
807 if (uap
->tv
!= NULL
) {
808 error
= copyin(uap
->tv
, &ktv
, sizeof (ktv
));
811 TIMEVAL_TO_TIMESPEC(&ktv
, &kts
);
820 error
= doselect(uap
->nd
, uap
->in
, uap
->ou
, uap
->ex
, ktsp
,
821 &uap
->sysmsg_result
);
828 * Pselect system call.
831 sys_pselect(struct pselect_args
*uap
)
833 struct thread
*td
= curthread
;
834 struct lwp
*lp
= td
->td_lwp
;
835 struct timespec
*ktsp
, kts
;
840 * Get timeout if any.
842 if (uap
->ts
!= NULL
) {
843 error
= copyin(uap
->ts
, &kts
, sizeof (kts
));
852 * Install temporary signal mask if any provided.
854 if (uap
->sigmask
!= NULL
) {
855 error
= copyin(uap
->sigmask
, &sigmask
, sizeof(sigmask
));
858 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
859 lp
->lwp_oldsigmask
= lp
->lwp_sigmask
;
860 SIG_CANTMASK(sigmask
);
861 lp
->lwp_sigmask
= sigmask
;
862 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
868 error
= doselect(uap
->nd
, uap
->in
, uap
->ou
, uap
->ex
, ktsp
,
869 &uap
->sysmsg_result
);
871 if (uap
->sigmask
!= NULL
) {
872 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
873 /* doselect() responsible for turning ERESTART into EINTR */
874 KKASSERT(error
!= ERESTART
);
875 if (error
== EINTR
) {
877 * We can't restore the previous signal mask now
878 * because it could block the signal that interrupted
879 * us. So make a note to restore it after executing
882 lp
->lwp_flags
|= LWP_OLDMASK
;
885 * No handler to run. Restore previous mask immediately.
887 lp
->lwp_sigmask
= lp
->lwp_oldsigmask
;
889 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
896 select_copyin(void *arg
, struct kevent
*kevp
, int maxevents
, int *events
)
898 struct select_kevent_copyin_args
*skap
= NULL
;
905 skap
= (struct select_kevent_copyin_args
*)arg
;
907 if (*events
== maxevents
)
910 while (skap
->active_set
< COPYIN_DONE
) {
911 switch (skap
->active_set
) {
914 * Register descriptors for the read filter
916 fdp
= skap
->read_set
;
917 filter
= EVFILT_READ
;
918 fflags
= NOTE_OLDAPI
;
926 * Register descriptors for the write filter
928 fdp
= skap
->write_set
;
929 filter
= EVFILT_WRITE
;
930 fflags
= NOTE_OLDAPI
;
938 * Register descriptors for the exception filter
940 fdp
= skap
->except_set
;
941 filter
= EVFILT_EXCEPT
;
942 fflags
= NOTE_OLDAPI
| NOTE_OOB
;
950 * Nothing left to register
956 while (skap
->proc_fds
< skap
->num_fds
) {
958 if (FD_ISSET(fd
, fdp
)) {
959 kev
= &kevp
[*events
];
960 EV_SET(kev
, fd
, filter
,
964 skap
->lwp
->lwp_kqueue_serial
);
969 kprintf("select fd %d filter %d "
970 "serial %ju\n", fd
, filter
,
972 skap
->lwp
->lwp_kqueue_serial
);
976 if (*events
== maxevents
)
987 select_copyout(void *arg
, struct kevent
*kevp
, int count
, int *res
)
989 struct select_kevent_copyin_args
*skap
;
993 skap
= (struct select_kevent_copyin_args
*)arg
;
995 for (i
= 0; i
< count
; ++i
) {
997 * Filter out and delete spurious events
999 if ((uint64_t)(uintptr_t)kevp
[i
].udata
!=
1000 skap
->lwp
->lwp_kqueue_serial
) {
1002 kev
.flags
= EV_DISABLE
|EV_DELETE
;
1003 kqueue_register(&skap
->lwp
->lwp_kqueue
, &kev
);
1005 kprintf("select fd %ju mismatched serial %ju\n",
1006 (uintmax_t)kevp
[i
].ident
,
1007 (uintmax_t)skap
->lwp
->lwp_kqueue_serial
);
1015 if (kevp
[i
].flags
& EV_ERROR
) {
1016 int error
= kevp
[i
].data
;
1021 * A bad file descriptor is considered a
1022 * fatal error for select, bail out.
1024 skap
->error
= error
;
1030 * Select silently swallows any unknown errors
1031 * for descriptors in the read or write sets.
1033 * ALWAYS filter out EOPNOTSUPP errors from
1034 * filters (at least until all filters support
1037 * We also filter out ENODEV since dev_dkqfilter
1038 * returns ENODEV if EOPNOTSUPP is returned in an
1043 if (kevp
[i
].filter
!= EVFILT_READ
&&
1044 kevp
[i
].filter
!= EVFILT_WRITE
&&
1045 error
!= EOPNOTSUPP
&&
1047 skap
->error
= error
;
1054 kprintf("select fd %ju filter %d error %d\n",
1055 (uintmax_t)kevp
[i
].ident
,
1056 kevp
[i
].filter
, error
);
1060 switch (kevp
[i
].filter
) {
1062 FD_SET(kevp
[i
].ident
, skap
->read_set
);
1065 FD_SET(kevp
[i
].ident
, skap
->write_set
);
1068 FD_SET(kevp
[i
].ident
, skap
->except_set
);
1079 * Copy select bits in from userland. Allocate kernel memory if the
1083 getbits(int bytes
, fd_set
*in_set
, kfd_set
**out_set
, kfd_set
*tmp_set
)
1088 if (bytes
< sizeof(*tmp_set
))
1091 *out_set
= kmalloc(bytes
, M_SELECT
, M_WAITOK
);
1092 error
= copyin(in_set
, *out_set
, bytes
);
1101 * Copy returned select bits back out to userland.
1104 putbits(int bytes
, kfd_set
*in_set
, fd_set
*out_set
)
1109 error
= copyout(in_set
, out_set
, bytes
);
1117 dotimeout_only(struct timespec
*ts
)
1119 return(nanosleep1(ts
, NULL
));
1123 * Common code for sys_select() and sys_pselect().
1125 * in, out and ex are userland pointers. ts must point to validated
1126 * kernel-side timeout value or NULL for infinite timeout. res must
1127 * point to syscall return value.
1130 doselect(int nd
, fd_set
*read
, fd_set
*write
, fd_set
*except
,
1131 struct timespec
*ts
, int *res
)
1133 struct proc
*p
= curproc
;
1134 struct select_kevent_copyin_args
*kap
, ka
;
1144 return (dotimeout_only(ts
));
1146 if (nd
> p
->p_fd
->fd_nfiles
) /* limit kmalloc */
1147 nd
= p
->p_fd
->fd_nfiles
;
1150 kap
->lwp
= curthread
->td_lwp
;
1154 kap
->active_set
= COPYIN_READ
;
1157 * Calculate bytes based on the number of __fd_mask[] array entries
1158 * multiplied by the size of __fd_mask.
1160 bytes
= howmany(nd
, __NFDBITS
) * sizeof(__fd_mask
);
1162 /* kap->read_set = NULL; not needed */
1163 kap
->write_set
= NULL
;
1164 kap
->except_set
= NULL
;
1166 error
= getbits(bytes
, read
, &kap
->read_set
, &read_tmp
);
1168 error
= getbits(bytes
, write
, &kap
->write_set
, &write_tmp
);
1170 error
= getbits(bytes
, except
, &kap
->except_set
, &except_tmp
);
1175 * NOTE: Make sure the max events passed to kern_kevent() is
1176 * effectively unlimited. (nd * 3) accomplishes this.
1178 * (*res) continues to increment as returned events are
1181 error
= kern_kevent(&kap
->lwp
->lwp_kqueue
, 0x7FFFFFFF, res
, kap
,
1182 select_copyin
, select_copyout
, ts
, 0);
1184 error
= putbits(bytes
, kap
->read_set
, read
);
1186 error
= putbits(bytes
, kap
->write_set
, write
);
1188 error
= putbits(bytes
, kap
->except_set
, except
);
1191 * An error from an individual event that should be passed
1192 * back to userland (EBADF)
1201 if (kap
->read_set
&& kap
->read_set
!= &read_tmp
)
1202 kfree(kap
->read_set
, M_SELECT
);
1203 if (kap
->write_set
&& kap
->write_set
!= &write_tmp
)
1204 kfree(kap
->write_set
, M_SELECT
);
1205 if (kap
->except_set
&& kap
->except_set
!= &except_tmp
)
1206 kfree(kap
->except_set
, M_SELECT
);
1208 kap
->lwp
->lwp_kqueue_serial
+= kap
->num_fds
;
1219 sys_poll(struct poll_args
*uap
)
1221 struct timespec ts
, *tsp
;
1224 if (uap
->timeout
!= INFTIM
) {
1225 if (uap
->timeout
< 0)
1227 ts
.tv_sec
= uap
->timeout
/ 1000;
1228 ts
.tv_nsec
= (uap
->timeout
% 1000) * 1000 * 1000;
1234 error
= dopoll(uap
->nfds
, uap
->fds
, tsp
, &uap
->sysmsg_result
, 0);
1240 * Ppoll system call.
1245 sys_ppoll(struct ppoll_args
*uap
)
1247 struct thread
*td
= curthread
;
1248 struct lwp
*lp
= td
->td_lwp
;
1249 struct timespec
*ktsp
, kts
;
1254 * Get timeout if any.
1256 if (uap
->ts
!= NULL
) {
1257 error
= copyin(uap
->ts
, &kts
, sizeof (kts
));
1266 * Install temporary signal mask if any provided.
1268 if (uap
->sigmask
!= NULL
) {
1269 error
= copyin(uap
->sigmask
, &sigmask
, sizeof(sigmask
));
1272 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
1273 lp
->lwp_oldsigmask
= lp
->lwp_sigmask
;
1274 SIG_CANTMASK(sigmask
);
1275 lp
->lwp_sigmask
= sigmask
;
1276 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
1279 error
= dopoll(uap
->nfds
, uap
->fds
, ktsp
, &uap
->sysmsg_result
,
1280 ktsp
!= NULL
? KEVENT_TIMEOUT_PRECISE
: 0);
1282 if (uap
->sigmask
!= NULL
) {
1283 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
1284 /* dopoll() responsible for turning ERESTART into EINTR */
1285 KKASSERT(error
!= ERESTART
);
1286 if (error
== EINTR
) {
1288 * We can't restore the previous signal mask now
1289 * because it could block the signal that interrupted
1290 * us. So make a note to restore it after executing
1293 lp
->lwp_flags
|= LWP_OLDMASK
;
1296 * No handler to run. Restore previous mask immediately.
1298 lp
->lwp_sigmask
= lp
->lwp_oldsigmask
;
1300 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
1307 poll_copyin(void *arg
, struct kevent
*kevp
, int maxevents
, int *events
)
1309 struct poll_kevent_copyin_args
*pkap
;
1314 pkap
= (struct poll_kevent_copyin_args
*)arg
;
1316 while (pkap
->pfds
< pkap
->nfds
) {
1317 pfd
= &pkap
->fds
[pkap
->pfds
];
1319 /* Clear return events */
1322 /* Do not check if fd is equal to -1 */
1323 if (pfd
->fd
== -1) {
1329 if (pfd
->events
& (POLLIN
| POLLRDNORM
))
1331 if (pfd
->events
& (POLLOUT
| POLLWRNORM
))
1333 if (pfd
->events
& (POLLPRI
| POLLRDBAND
))
1336 if (*events
+ kev_count
> maxevents
)
1340 * NOTE: A combined serial number and poll array index is
1341 * stored in kev->udata.
1343 kev
= &kevp
[*events
];
1344 if (pfd
->events
& (POLLIN
| POLLRDNORM
)) {
1345 EV_SET(kev
++, pfd
->fd
, EVFILT_READ
, EV_ADD
|EV_ENABLE
,
1346 NOTE_OLDAPI
, 0, (void *)(uintptr_t)
1347 (pkap
->lwp
->lwp_kqueue_serial
+ pkap
->pfds
));
1349 if (pfd
->events
& (POLLOUT
| POLLWRNORM
)) {
1350 EV_SET(kev
++, pfd
->fd
, EVFILT_WRITE
, EV_ADD
|EV_ENABLE
,
1351 NOTE_OLDAPI
, 0, (void *)(uintptr_t)
1352 (pkap
->lwp
->lwp_kqueue_serial
+ pkap
->pfds
));
1354 if (pfd
->events
& (POLLPRI
| POLLRDBAND
)) {
1355 EV_SET(kev
++, pfd
->fd
, EVFILT_EXCEPT
, EV_ADD
|EV_ENABLE
,
1356 NOTE_OLDAPI
| NOTE_OOB
, 0,
1358 (pkap
->lwp
->lwp_kqueue_serial
+ pkap
->pfds
));
1362 kprintf("poll index %d/%d fd %d events %08x "
1363 "serial %ju\n", pkap
->pfds
, pkap
->nfds
-1,
1364 pfd
->fd
, pfd
->events
,
1365 (uintmax_t)pkap
->lwp
->lwp_kqueue_serial
);
1369 (*events
) += kev_count
;
1376 poll_copyout(void *arg
, struct kevent
*kevp
, int count
, int *res
)
1378 struct poll_kevent_copyin_args
*pkap
;
1385 pkap
= (struct poll_kevent_copyin_args
*)arg
;
1387 for (i
= 0; i
< count
; ++i
) {
1389 * Extract the poll array index and delete spurious events.
1390 * We can easily tell if the serial number is incorrect
1391 * by checking whether the extracted index is out of range.
1393 pi
= (uint64_t)(uintptr_t)kevp
[i
].udata
-
1394 pkap
->lwp
->lwp_kqueue_serial
;
1396 if (pi
>= pkap
->nfds
) {
1398 kev
.flags
= EV_DISABLE
|EV_DELETE
;
1399 kqueue_register(&pkap
->lwp
->lwp_kqueue
, &kev
);
1401 kprintf("poll index %ju out of range against "
1402 "serial %ju\n", (uintmax_t)pi
,
1403 (uintmax_t)pkap
->lwp
->lwp_kqueue_serial
);
1407 pfd
= &pkap
->fds
[pi
];
1408 if (kevp
[i
].ident
== pfd
->fd
) {
1410 * A single descriptor may generate an error against
1411 * more than one filter, make sure to set the
1412 * appropriate flags but do not increment (*res)
1415 count_res
= (pfd
->revents
== 0);
1416 if (kevp
[i
].flags
& EV_ERROR
) {
1417 switch(kevp
[i
].data
) {
1420 /* Bad file descriptor */
1423 pfd
->revents
|= POLLNVAL
;
1427 * Poll silently swallows any unknown
1428 * errors except in the case of POLLPRI
1429 * (OOB/urgent data).
1431 * ALWAYS filter out EOPNOTSUPP errors
1432 * from filters, common applications
1433 * set POLLPRI|POLLRDBAND and most
1434 * filters do not support EVFILT_EXCEPT.
1436 * We also filter out ENODEV since dev_dkqfilter
1437 * returns ENODEV if EOPNOTSUPP is returned in an
1442 if (kevp
[i
].filter
!= EVFILT_READ
&&
1443 kevp
[i
].filter
!= EVFILT_WRITE
&&
1444 kevp
[i
].data
!= EOPNOTSUPP
&&
1445 kevp
[i
].data
!= ENODEV
) {
1448 pfd
->revents
|= POLLERR
;
1453 kprintf("poll index %ju fd %d "
1454 "filter %d error %jd\n",
1455 (uintmax_t)pi
, pfd
->fd
,
1457 (intmax_t)kevp
[i
].data
);
1462 switch (kevp
[i
].filter
) {
1466 * NODATA on the read side can indicate a
1467 * half-closed situation and not necessarily
1468 * a disconnect, so depend on the user
1469 * issuing a read() and getting 0 bytes back.
1471 if (kevp
[i
].flags
& EV_NODATA
)
1472 pfd
->revents
|= POLLHUP
;
1474 if ((kevp
[i
].flags
& EV_EOF
) &&
1475 kevp
[i
].fflags
!= 0)
1476 pfd
->revents
|= POLLERR
;
1477 if (pfd
->events
& POLLIN
)
1478 pfd
->revents
|= POLLIN
;
1479 if (pfd
->events
& POLLRDNORM
)
1480 pfd
->revents
|= POLLRDNORM
;
1484 * As per the OpenGroup POLLHUP is mutually
1485 * exclusive with the writability flags. I
1486 * consider this a bit broken but...
1488 * In this case a disconnect is implied even
1489 * for a half-closed (write side) situation.
1491 if (kevp
[i
].flags
& EV_EOF
) {
1492 pfd
->revents
|= POLLHUP
;
1493 if (kevp
[i
].fflags
!= 0)
1494 pfd
->revents
|= POLLERR
;
1496 if (pfd
->events
& POLLOUT
)
1497 pfd
->revents
|= POLLOUT
;
1498 if (pfd
->events
& POLLWRNORM
)
1499 pfd
->revents
|= POLLWRNORM
;
1504 * EV_NODATA should never be tagged for this
1507 if (pfd
->events
& POLLPRI
)
1508 pfd
->revents
|= POLLPRI
;
1509 if (pfd
->events
& POLLRDBAND
)
1510 pfd
->revents
|= POLLRDBAND
;
1515 kprintf("poll index %ju/%d fd %d "
1516 "revents %08x\n", (uintmax_t)pi
, pkap
->nfds
,
1517 pfd
->fd
, pfd
->revents
);
1520 if (count_res
&& pfd
->revents
)
1524 kprintf("poll index %ju mismatch %ju/%d\n",
1525 (uintmax_t)pi
, (uintmax_t)kevp
[i
].ident
,
1535 dopoll(int nfds
, struct pollfd
*fds
, struct timespec
*ts
, int *res
, int flags
)
1537 struct poll_kevent_copyin_args ka
;
1538 struct pollfd sfds
[64];
1546 if (nfds
== 0 && ts
)
1547 return (dotimeout_only(ts
));
1550 * This is a bit arbitrary but we need to limit internal kmallocs.
1552 if (nfds
> maxfilesperproc
* 2)
1553 nfds
= maxfilesperproc
* 2;
1554 bytes
= sizeof(struct pollfd
) * nfds
;
1556 ka
.lwp
= curthread
->td_lwp
;
1564 ka
.fds
= kmalloc(bytes
, M_SELECT
, M_WAITOK
);
1566 error
= copyin(fds
, ka
.fds
, bytes
);
1568 error
= kern_kevent(&ka
.lwp
->lwp_kqueue
, 0x7FFFFFFF, res
, &ka
,
1569 poll_copyin
, poll_copyout
, ts
, flags
);
1572 error
= copyout(ka
.fds
, fds
, bytes
);
1575 kfree(ka
.fds
, M_SELECT
);
1577 ka
.lwp
->lwp_kqueue_serial
+= nfds
;
/* No-op copyin callback for socket_wait(); the kevent is preloaded. */
static int
socket_wait_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	return (0);
}
/* Copyout callback for socket_wait(): just count the wakeup. */
static int
socket_wait_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	++*res;
	return (0);
}
1595 extern struct fileops socketops
;
1598 * NOTE: Callers of socket_wait() must already have a reference on the
1602 socket_wait(struct socket
*so
, struct timespec
*ts
, int *res
)
1604 struct thread
*td
= curthread
;
1610 if ((error
= falloc(td
->td_lwp
, &fp
, &fd
)) != 0)
1613 fp
->f_type
= DTYPE_SOCKET
;
1614 fp
->f_flag
= FREAD
| FWRITE
;
1615 fp
->f_ops
= &socketops
;
1617 fsetfd(td
->td_lwp
->lwp_proc
->p_fd
, fp
, fd
);
1618 fsetfdflags(td
->td_proc
->p_fd
, fd
, UF_EXCLOSE
);
1620 bzero(&kq
, sizeof(kq
));
1621 kqueue_init(&kq
, td
->td_lwp
->lwp_proc
->p_fd
);
1622 EV_SET(&kev
, fd
, EVFILT_READ
, EV_ADD
|EV_ENABLE
, 0, 0, NULL
);
1623 if ((error
= kqueue_register(&kq
, &kev
)) != 0) {
1628 error
= kern_kevent(&kq
, 1, res
, NULL
, socket_wait_copyin
,
1629 socket_wait_copyout
, ts
, 0);
1631 EV_SET(&kev
, fd
, EVFILT_READ
, EV_DELETE
|EV_DISABLE
, 0, 0, NULL
);
1632 kqueue_register(&kq
, &kev
);
1633 fp
->f_ops
= &badfileops
;
/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
int
sys_openbsd_poll(struct openbsd_poll_args *uap)
{
	return (sys_poll((struct poll_args *)uap));
}
1653 seltrue(cdev_t dev
, int events
)
1655 return (events
& (POLLIN
| POLLOUT
| POLLRDNORM
| POLLWRNORM
));