2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
35 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
38 #include "opt_ktrace.h"
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/sysproto.h>
43 #include <sys/event.h>
44 #include <sys/filedesc.h>
45 #include <sys/filio.h>
46 #include <sys/fcntl.h>
49 #include <sys/signalvar.h>
50 #include <sys/socketvar.h>
52 #include <sys/kernel.h>
53 #include <sys/kern_syscall.h>
54 #include <sys/malloc.h>
55 #include <sys/mapped_ioctl.h>
57 #include <sys/queue.h>
58 #include <sys/resourcevar.h>
59 #include <sys/socketops.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
64 #include <sys/ktrace.h>
67 #include <vm/vm_page.h>
69 #include <sys/file2.h>
70 #include <sys/mplock2.h>
71 #include <sys/spinlock2.h>
73 #include <machine/limits.h>
75 static MALLOC_DEFINE(M_IOCTLOPS
, "ioctlops", "ioctl data buffer");
76 static MALLOC_DEFINE(M_IOCTLMAP
, "ioctlmap", "mapped ioctl handler buffer");
77 static MALLOC_DEFINE(M_SELECT
, "select", "select() buffer");
78 MALLOC_DEFINE(M_IOV
, "iov", "large iov's");
80 typedef struct kfd_set
{
84 enum select_copyin_states
{
85 COPYIN_READ
, COPYIN_WRITE
, COPYIN_EXCEPT
, COPYIN_DONE
};
87 struct select_kevent_copyin_args
{
91 int active_set
; /* One of select_copyin_states */
92 struct lwp
*lwp
; /* Pointer to our lwp */
93 int num_fds
; /* Number of file descriptors (syscall arg) */
94 int proc_fds
; /* Processed fd's (wraps) */
95 int error
; /* Returned to userland */
98 struct poll_kevent_copyin_args
{
106 static struct lwkt_token mioctl_token
= LWKT_TOKEN_INITIALIZER(mioctl_token
);
108 static int doselect(int nd
, fd_set
*in
, fd_set
*ou
, fd_set
*ex
,
109 struct timespec
*ts
, int *res
);
110 static int dopoll(int nfds
, struct pollfd
*fds
, struct timespec
*ts
,
111 int *res
, int flags
);
112 static int dofileread(int, struct file
*, struct uio
*, int, size_t *);
113 static int dofilewrite(int, struct file
*, struct uio
*, int, size_t *);
121 sys_read(struct read_args
*uap
)
123 struct thread
*td
= curthread
;
128 if ((ssize_t
)uap
->nbyte
< 0)
131 aiov
.iov_base
= uap
->buf
;
132 aiov
.iov_len
= uap
->nbyte
;
133 auio
.uio_iov
= &aiov
;
135 auio
.uio_offset
= -1;
136 auio
.uio_resid
= uap
->nbyte
;
137 auio
.uio_rw
= UIO_READ
;
138 auio
.uio_segflg
= UIO_USERSPACE
;
141 error
= kern_preadv(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
146 * Positioned (Pread) read system call
151 sys_extpread(struct extpread_args
*uap
)
153 struct thread
*td
= curthread
;
159 if ((ssize_t
)uap
->nbyte
< 0)
162 aiov
.iov_base
= uap
->buf
;
163 aiov
.iov_len
= uap
->nbyte
;
164 auio
.uio_iov
= &aiov
;
166 auio
.uio_offset
= uap
->offset
;
167 auio
.uio_resid
= uap
->nbyte
;
168 auio
.uio_rw
= UIO_READ
;
169 auio
.uio_segflg
= UIO_USERSPACE
;
172 flags
= uap
->flags
& O_FMASK
;
173 if (uap
->offset
!= (off_t
)-1)
176 error
= kern_preadv(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
181 * Scatter read system call.
186 sys_readv(struct readv_args
*uap
)
188 struct thread
*td
= curthread
;
190 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
193 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
198 auio
.uio_iovcnt
= uap
->iovcnt
;
199 auio
.uio_offset
= -1;
200 auio
.uio_rw
= UIO_READ
;
201 auio
.uio_segflg
= UIO_USERSPACE
;
204 error
= kern_preadv(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
206 iovec_free(&iov
, aiov
);
212 * Scatter positioned read system call.
217 sys_extpreadv(struct extpreadv_args
*uap
)
219 struct thread
*td
= curthread
;
221 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
225 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
230 auio
.uio_iovcnt
= uap
->iovcnt
;
231 auio
.uio_offset
= uap
->offset
;
232 auio
.uio_rw
= UIO_READ
;
233 auio
.uio_segflg
= UIO_USERSPACE
;
236 flags
= uap
->flags
& O_FMASK
;
237 if (uap
->offset
!= (off_t
)-1)
240 error
= kern_preadv(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
242 iovec_free(&iov
, aiov
);
250 kern_preadv(int fd
, struct uio
*auio
, int flags
, size_t *res
)
252 struct thread
*td
= curthread
;
253 struct proc
*p
= td
->td_proc
;
259 fp
= holdfp(p
->p_fd
, fd
, FREAD
);
262 if (flags
& O_FOFFSET
&& fp
->f_type
!= DTYPE_VNODE
) {
265 error
= dofileread(fd
, fp
, auio
, flags
, res
);
272 * Common code for readv and preadv that reads data in
273 * from a file using the passed in uio, offset, and flags.
275 * MPALMOSTSAFE - ktrace needs help
278 dofileread(int fd
, struct file
*fp
, struct uio
*auio
, int flags
, size_t *res
)
283 struct thread
*td
= curthread
;
284 struct iovec
*ktriov
= NULL
;
290 * if tracing, save a copy of iovec
292 if (KTRPOINT(td
, KTR_GENIO
)) {
293 int iovlen
= auio
->uio_iovcnt
* sizeof(struct iovec
);
295 ktriov
= kmalloc(iovlen
, M_TEMP
, M_WAITOK
);
296 bcopy((caddr_t
)auio
->uio_iov
, (caddr_t
)ktriov
, iovlen
);
300 len
= auio
->uio_resid
;
301 error
= fo_read(fp
, auio
, fp
->f_cred
, flags
);
303 if (auio
->uio_resid
!= len
&& (error
== ERESTART
||
304 error
== EINTR
|| error
== EWOULDBLOCK
))
308 if (ktriov
!= NULL
) {
310 ktruio
.uio_iov
= ktriov
;
311 ktruio
.uio_resid
= len
- auio
->uio_resid
;
312 ktrgenio(td
->td_lwp
, fd
, UIO_READ
, &ktruio
, error
);
314 kfree(ktriov
, M_TEMP
);
318 *res
= len
- auio
->uio_resid
;
329 sys_write(struct write_args
*uap
)
331 struct thread
*td
= curthread
;
336 if ((ssize_t
)uap
->nbyte
< 0)
339 aiov
.iov_base
= (void *)(uintptr_t)uap
->buf
;
340 aiov
.iov_len
= uap
->nbyte
;
341 auio
.uio_iov
= &aiov
;
343 auio
.uio_offset
= -1;
344 auio
.uio_resid
= uap
->nbyte
;
345 auio
.uio_rw
= UIO_WRITE
;
346 auio
.uio_segflg
= UIO_USERSPACE
;
349 error
= kern_pwritev(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
360 sys_extpwrite(struct extpwrite_args
*uap
)
362 struct thread
*td
= curthread
;
368 if ((ssize_t
)uap
->nbyte
< 0)
371 aiov
.iov_base
= (void *)(uintptr_t)uap
->buf
;
372 aiov
.iov_len
= uap
->nbyte
;
373 auio
.uio_iov
= &aiov
;
375 auio
.uio_offset
= uap
->offset
;
376 auio
.uio_resid
= uap
->nbyte
;
377 auio
.uio_rw
= UIO_WRITE
;
378 auio
.uio_segflg
= UIO_USERSPACE
;
381 flags
= uap
->flags
& O_FMASK
;
382 if (uap
->offset
!= (off_t
)-1)
384 error
= kern_pwritev(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
392 sys_writev(struct writev_args
*uap
)
394 struct thread
*td
= curthread
;
396 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
399 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
404 auio
.uio_iovcnt
= uap
->iovcnt
;
405 auio
.uio_offset
= -1;
406 auio
.uio_rw
= UIO_WRITE
;
407 auio
.uio_segflg
= UIO_USERSPACE
;
410 error
= kern_pwritev(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
412 iovec_free(&iov
, aiov
);
418 * Gather positioned write system call
423 sys_extpwritev(struct extpwritev_args
*uap
)
425 struct thread
*td
= curthread
;
427 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
431 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
436 auio
.uio_iovcnt
= uap
->iovcnt
;
437 auio
.uio_offset
= uap
->offset
;
438 auio
.uio_rw
= UIO_WRITE
;
439 auio
.uio_segflg
= UIO_USERSPACE
;
442 flags
= uap
->flags
& O_FMASK
;
443 if (uap
->offset
!= (off_t
)-1)
446 error
= kern_pwritev(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
448 iovec_free(&iov
, aiov
);
456 kern_pwritev(int fd
, struct uio
*auio
, int flags
, size_t *res
)
458 struct thread
*td
= curthread
;
459 struct proc
*p
= td
->td_proc
;
465 fp
= holdfp(p
->p_fd
, fd
, FWRITE
);
468 else if ((flags
& O_FOFFSET
) && fp
->f_type
!= DTYPE_VNODE
) {
471 error
= dofilewrite(fd
, fp
, auio
, flags
, res
);
479 * Common code for writev and pwritev that writes data to
480 * a file using the passed in uio, offset, and flags.
482 * MPALMOSTSAFE - ktrace needs help
485 dofilewrite(int fd
, struct file
*fp
, struct uio
*auio
, int flags
, size_t *res
)
487 struct thread
*td
= curthread
;
488 struct lwp
*lp
= td
->td_lwp
;
492 struct iovec
*ktriov
= NULL
;
498 * if tracing, save a copy of iovec and uio
500 if (KTRPOINT(td
, KTR_GENIO
)) {
501 int iovlen
= auio
->uio_iovcnt
* sizeof(struct iovec
);
503 ktriov
= kmalloc(iovlen
, M_TEMP
, M_WAITOK
);
504 bcopy((caddr_t
)auio
->uio_iov
, (caddr_t
)ktriov
, iovlen
);
508 len
= auio
->uio_resid
;
509 error
= fo_write(fp
, auio
, fp
->f_cred
, flags
);
511 if (auio
->uio_resid
!= len
&& (error
== ERESTART
||
512 error
== EINTR
|| error
== EWOULDBLOCK
))
514 /* Socket layer is responsible for issuing SIGPIPE. */
515 if (error
== EPIPE
&& fp
->f_type
!= DTYPE_SOCKET
)
516 lwpsignal(lp
->lwp_proc
, lp
, SIGPIPE
);
519 if (ktriov
!= NULL
) {
521 ktruio
.uio_iov
= ktriov
;
522 ktruio
.uio_resid
= len
- auio
->uio_resid
;
523 ktrgenio(lp
, fd
, UIO_WRITE
, &ktruio
, error
);
525 kfree(ktriov
, M_TEMP
);
529 *res
= len
- auio
->uio_resid
;
540 sys_ioctl(struct ioctl_args
*uap
)
544 error
= mapped_ioctl(uap
->fd
, uap
->com
, uap
->data
, NULL
, &uap
->sysmsg
);
548 struct ioctl_map_entry
{
550 struct ioctl_map_range
*cmd_ranges
;
551 LIST_ENTRY(ioctl_map_entry
) entries
;
555 * The true heart of all ioctl syscall handlers (native, emulation).
556 * If map != NULL, it will be searched for a matching entry for com,
557 * and appropriate conversions/conversion functions will be utilized.
562 mapped_ioctl(int fd
, u_long com
, caddr_t uspc_data
, struct ioctl_map
*map
,
565 struct thread
*td
= curthread
;
566 struct proc
*p
= td
->td_proc
;
569 struct ioctl_map_range
*iomc
= NULL
;
575 #define STK_PARAMS 128
577 char stkbuf
[STK_PARAMS
];
585 fp
= holdfp(p
->p_fd
, fd
, FREAD
|FWRITE
);
589 if (map
!= NULL
) { /* obey translation map */
591 struct ioctl_map_entry
*e
;
593 maskcmd
= com
& map
->mask
;
595 lwkt_gettoken(&mioctl_token
);
596 LIST_FOREACH(e
, &map
->mapping
, entries
) {
597 for (iomc
= e
->cmd_ranges
; iomc
->start
!= 0 ||
598 iomc
->maptocmd
!= 0 || iomc
->wrapfunc
!= NULL
||
599 iomc
->mapfunc
!= NULL
;
601 if (maskcmd
>= iomc
->start
&&
602 maskcmd
<= iomc
->end
)
606 /* Did we find a match? */
607 if (iomc
->start
!= 0 || iomc
->maptocmd
!= 0 ||
608 iomc
->wrapfunc
!= NULL
|| iomc
->mapfunc
!= NULL
)
611 lwkt_reltoken(&mioctl_token
);
614 (iomc
->start
== 0 && iomc
->maptocmd
== 0
615 && iomc
->wrapfunc
== NULL
&& iomc
->mapfunc
== NULL
)) {
616 kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
617 map
->sys
, fd
, maskcmd
,
618 (int)((maskcmd
>> 8) & 0xff),
619 (int)(maskcmd
& 0xff));
625 * If it's a non-range one to one mapping, maptocmd should be
626 * correct. If it's a ranged one to one mapping, we pass the
627 * original value of com, and for a range mapped to a different
628 * range, we always need a mapping function to translate the
629 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
631 if (iomc
->start
== iomc
->end
&& iomc
->maptocmd
== iomc
->maptoend
) {
632 com
= iomc
->maptocmd
;
633 } else if (iomc
->start
== iomc
->maptocmd
&& iomc
->end
== iomc
->maptoend
) {
634 if (iomc
->mapfunc
!= NULL
)
635 com
= iomc
->mapfunc(iomc
->start
, iomc
->end
,
636 iomc
->start
, iomc
->end
,
639 if (iomc
->mapfunc
!= NULL
) {
640 com
= iomc
->mapfunc(iomc
->start
, iomc
->end
,
641 iomc
->maptocmd
, iomc
->maptoend
,
644 kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
645 map
->sys
, fd
, maskcmd
,
646 (int)((maskcmd
>> 8) & 0xff),
647 (int)(maskcmd
& 0xff));
656 error
= fclrfdflags(p
->p_fd
, fd
, UF_EXCLOSE
);
659 error
= fsetfdflags(p
->p_fd
, fd
, UF_EXCLOSE
);
664 * Interpret high order word to find amount of data to be
665 * copied to/from the user's address space.
667 size
= IOCPARM_LEN(com
);
668 if (size
> IOCPARM_MAX
) {
673 if ((com
& IOC_VOID
) == 0 && size
> sizeof(ubuf
.stkbuf
)) {
674 memp
= kmalloc(size
, M_IOCTLOPS
, M_WAITOK
);
680 if (com
& IOC_VOID
) {
681 *(caddr_t
*)data
= uspc_data
;
682 } else if (com
& IOC_IN
) {
684 error
= copyin(uspc_data
, data
, (size_t)size
);
688 *(caddr_t
*)data
= uspc_data
;
690 } else if ((com
& IOC_OUT
) != 0 && size
) {
692 * Zero the buffer so the user always
693 * gets back something deterministic.
695 bzero(data
, (size_t)size
);
700 if ((tmp
= *(int *)data
))
701 atomic_set_int(&fp
->f_flag
, FNONBLOCK
);
703 atomic_clear_int(&fp
->f_flag
, FNONBLOCK
);
708 if ((tmp
= *(int *)data
))
709 atomic_set_int(&fp
->f_flag
, FASYNC
);
711 atomic_clear_int(&fp
->f_flag
, FASYNC
);
712 error
= fo_ioctl(fp
, FIOASYNC
, (caddr_t
)&tmp
, cred
, msg
);
717 * If there is a override function,
718 * call it instead of directly routing the call
720 if (map
!= NULL
&& iomc
->wrapfunc
!= NULL
)
721 error
= iomc
->wrapfunc(fp
, com
, ocom
, data
, cred
);
723 error
= fo_ioctl(fp
, com
, data
, cred
, msg
);
725 * Copy any data to user, size was
726 * already set and checked above.
728 if (error
== 0 && (com
& IOC_OUT
) != 0 && size
!= 0)
729 error
= copyout(data
, uspc_data
, (size_t)size
);
734 kfree(memp
, M_IOCTLOPS
);
743 mapped_ioctl_register_handler(struct ioctl_map_handler
*he
)
745 struct ioctl_map_entry
*ne
;
747 KKASSERT(he
!= NULL
&& he
->map
!= NULL
&& he
->cmd_ranges
!= NULL
&&
748 he
->subsys
!= NULL
&& *he
->subsys
!= '\0');
750 ne
= kmalloc(sizeof(struct ioctl_map_entry
), M_IOCTLMAP
,
753 ne
->subsys
= he
->subsys
;
754 ne
->cmd_ranges
= he
->cmd_ranges
;
756 lwkt_gettoken(&mioctl_token
);
757 LIST_INSERT_HEAD(&he
->map
->mapping
, ne
, entries
);
758 lwkt_reltoken(&mioctl_token
);
767 mapped_ioctl_unregister_handler(struct ioctl_map_handler
*he
)
769 struct ioctl_map_entry
*ne
;
772 KKASSERT(he
!= NULL
&& he
->map
!= NULL
&& he
->cmd_ranges
!= NULL
);
774 lwkt_gettoken(&mioctl_token
);
775 LIST_FOREACH(ne
, &he
->map
->mapping
, entries
) {
776 if (ne
->cmd_ranges
== he
->cmd_ranges
) {
777 LIST_REMOVE(ne
, entries
);
778 kfree(ne
, M_IOCTLMAP
);
783 lwkt_reltoken(&mioctl_token
);
787 static int nselcoll
; /* Select collisions since boot */
789 SYSCTL_INT(_kern
, OID_AUTO
, nselcoll
, CTLFLAG_RD
, &nselcoll
, 0, "");
790 static int nseldebug
;
791 SYSCTL_INT(_kern
, OID_AUTO
, nseldebug
, CTLFLAG_RW
, &nseldebug
, 0, "");
794 * Select system call.
799 sys_select(struct select_args
*uap
)
802 struct timespec
*ktsp
, kts
;
806 * Get timeout if any.
808 if (uap
->tv
!= NULL
) {
809 error
= copyin(uap
->tv
, &ktv
, sizeof (ktv
));
812 TIMEVAL_TO_TIMESPEC(&ktv
, &kts
);
821 error
= doselect(uap
->nd
, uap
->in
, uap
->ou
, uap
->ex
, ktsp
,
822 &uap
->sysmsg_result
);
829 * Pselect system call.
832 sys_pselect(struct pselect_args
*uap
)
834 struct thread
*td
= curthread
;
835 struct lwp
*lp
= td
->td_lwp
;
836 struct timespec
*ktsp
, kts
;
841 * Get timeout if any.
843 if (uap
->ts
!= NULL
) {
844 error
= copyin(uap
->ts
, &kts
, sizeof (kts
));
853 * Install temporary signal mask if any provided.
855 if (uap
->sigmask
!= NULL
) {
856 error
= copyin(uap
->sigmask
, &sigmask
, sizeof(sigmask
));
859 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
860 lp
->lwp_oldsigmask
= lp
->lwp_sigmask
;
861 SIG_CANTMASK(sigmask
);
862 lp
->lwp_sigmask
= sigmask
;
863 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
869 error
= doselect(uap
->nd
, uap
->in
, uap
->ou
, uap
->ex
, ktsp
,
870 &uap
->sysmsg_result
);
872 if (uap
->sigmask
!= NULL
) {
873 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
874 /* doselect() responsible for turning ERESTART into EINTR */
875 KKASSERT(error
!= ERESTART
);
876 if (error
== EINTR
) {
878 * We can't restore the previous signal mask now
879 * because it could block the signal that interrupted
880 * us. So make a note to restore it after executing
883 lp
->lwp_flags
|= LWP_OLDMASK
;
886 * No handler to run. Restore previous mask immediately.
888 lp
->lwp_sigmask
= lp
->lwp_oldsigmask
;
890 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
897 select_copyin(void *arg
, struct kevent
*kevp
, int maxevents
, int *events
)
899 struct select_kevent_copyin_args
*skap
= NULL
;
906 skap
= (struct select_kevent_copyin_args
*)arg
;
908 if (*events
== maxevents
)
911 while (skap
->active_set
< COPYIN_DONE
) {
912 switch (skap
->active_set
) {
915 * Register descriptors for the read filter
917 fdp
= skap
->read_set
;
918 filter
= EVFILT_READ
;
919 fflags
= NOTE_OLDAPI
;
927 * Register descriptors for the write filter
929 fdp
= skap
->write_set
;
930 filter
= EVFILT_WRITE
;
931 fflags
= NOTE_OLDAPI
;
939 * Register descriptors for the exception filter
941 fdp
= skap
->except_set
;
942 filter
= EVFILT_EXCEPT
;
943 fflags
= NOTE_OLDAPI
| NOTE_OOB
;
951 * Nothing left to register
957 while (skap
->proc_fds
< skap
->num_fds
) {
959 if (FD_ISSET(fd
, fdp
)) {
960 kev
= &kevp
[*events
];
961 EV_SET(kev
, fd
, filter
,
965 skap
->lwp
->lwp_kqueue_serial
);
970 kprintf("select fd %d filter %d serial %d\n",
971 fd
, filter
, skap
->lwp
->lwp_kqueue_serial
);
974 if (*events
== maxevents
)
985 select_copyout(void *arg
, struct kevent
*kevp
, int count
, int *res
)
987 struct select_kevent_copyin_args
*skap
;
991 skap
= (struct select_kevent_copyin_args
*)arg
;
993 for (i
= 0; i
< count
; ++i
) {
995 * Filter out and delete spurious events
997 if ((u_int
)(uintptr_t)kevp
[i
].udata
!=
998 skap
->lwp
->lwp_kqueue_serial
) {
1000 kev
.flags
= EV_DISABLE
|EV_DELETE
;
1001 kqueue_register(&skap
->lwp
->lwp_kqueue
, &kev
);
1003 kprintf("select fd %ju mismatched serial %d\n",
1004 (uintmax_t)kevp
[i
].ident
,
1005 skap
->lwp
->lwp_kqueue_serial
);
1012 if (kevp
[i
].flags
& EV_ERROR
) {
1013 int error
= kevp
[i
].data
;
1018 * A bad file descriptor is considered a
1019 * fatal error for select, bail out.
1021 skap
->error
= error
;
1027 * Select silently swallows any unknown errors
1028 * for descriptors in the read or write sets.
1030 * ALWAYS filter out EOPNOTSUPP errors from
1031 * filters (at least until all filters support
1034 * We also filter out ENODEV since dev_dkqfilter
1035 * returns ENODEV if EOPNOTSUPP is returned in an
1040 if (kevp
[i
].filter
!= EVFILT_READ
&&
1041 kevp
[i
].filter
!= EVFILT_WRITE
&&
1042 error
!= EOPNOTSUPP
&&
1044 skap
->error
= error
;
1051 kprintf("select fd %ju filter %d error %d\n",
1052 (uintmax_t)kevp
[i
].ident
,
1053 kevp
[i
].filter
, error
);
1057 switch (kevp
[i
].filter
) {
1059 FD_SET(kevp
[i
].ident
, skap
->read_set
);
1062 FD_SET(kevp
[i
].ident
, skap
->write_set
);
1065 FD_SET(kevp
[i
].ident
, skap
->except_set
);
1076 * Copy select bits in from userland. Allocate kernel memory if the
1080 getbits(int bytes
, fd_set
*in_set
, kfd_set
**out_set
, kfd_set
*tmp_set
)
1085 if (bytes
< sizeof(*tmp_set
))
1088 *out_set
= kmalloc(bytes
, M_SELECT
, M_WAITOK
);
1089 error
= copyin(in_set
, *out_set
, bytes
);
1098 * Copy returned select bits back out to userland.
1101 putbits(int bytes
, kfd_set
*in_set
, fd_set
*out_set
)
1106 error
= copyout(in_set
, out_set
, bytes
);
1114 dotimeout_only(struct timespec
*ts
)
1116 return(nanosleep1(ts
, NULL
));
1120 * Common code for sys_select() and sys_pselect().
1122 * in, out and ex are userland pointers. ts must point to validated
1123 * kernel-side timeout value or NULL for infinite timeout. res must
1124 * point to syscall return value.
1127 doselect(int nd
, fd_set
*read
, fd_set
*write
, fd_set
*except
,
1128 struct timespec
*ts
, int *res
)
1130 struct proc
*p
= curproc
;
1131 struct select_kevent_copyin_args
*kap
, ka
;
1141 return (dotimeout_only(ts
));
1143 if (nd
> p
->p_fd
->fd_nfiles
) /* limit kmalloc */
1144 nd
= p
->p_fd
->fd_nfiles
;
1147 kap
->lwp
= curthread
->td_lwp
;
1151 kap
->active_set
= COPYIN_READ
;
1154 * Calculate bytes based on the number of __fd_mask[] array entries
1155 * multiplied by the size of __fd_mask.
1157 bytes
= howmany(nd
, __NFDBITS
) * sizeof(__fd_mask
);
1159 /* kap->read_set = NULL; not needed */
1160 kap
->write_set
= NULL
;
1161 kap
->except_set
= NULL
;
1163 error
= getbits(bytes
, read
, &kap
->read_set
, &read_tmp
);
1165 error
= getbits(bytes
, write
, &kap
->write_set
, &write_tmp
);
1167 error
= getbits(bytes
, except
, &kap
->except_set
, &except_tmp
);
1172 * NOTE: Make sure the max events passed to kern_kevent() is
1173 * effectively unlimited. (nd * 3) accomplishes this.
1175 * (*res) continues to increment as returned events are
1178 error
= kern_kevent(&kap
->lwp
->lwp_kqueue
, 0x7FFFFFFF, res
, kap
,
1179 select_copyin
, select_copyout
, ts
, 0);
1181 error
= putbits(bytes
, kap
->read_set
, read
);
1183 error
= putbits(bytes
, kap
->write_set
, write
);
1185 error
= putbits(bytes
, kap
->except_set
, except
);
1188 * An error from an individual event that should be passed
1189 * back to userland (EBADF)
1198 if (kap
->read_set
&& kap
->read_set
!= &read_tmp
)
1199 kfree(kap
->read_set
, M_SELECT
);
1200 if (kap
->write_set
&& kap
->write_set
!= &write_tmp
)
1201 kfree(kap
->write_set
, M_SELECT
);
1202 if (kap
->except_set
&& kap
->except_set
!= &except_tmp
)
1203 kfree(kap
->except_set
, M_SELECT
);
1205 kap
->lwp
->lwp_kqueue_serial
+= kap
->num_fds
;
1216 sys_poll(struct poll_args
*uap
)
1218 struct timespec ts
, *tsp
;
1221 if (uap
->timeout
!= INFTIM
) {
1222 if (uap
->timeout
< 0)
1224 ts
.tv_sec
= uap
->timeout
/ 1000;
1225 ts
.tv_nsec
= (uap
->timeout
% 1000) * 1000 * 1000;
1231 error
= dopoll(uap
->nfds
, uap
->fds
, tsp
, &uap
->sysmsg_result
, 0);
1237 * Ppoll system call.
1242 sys_ppoll(struct ppoll_args
*uap
)
1244 struct thread
*td
= curthread
;
1245 struct lwp
*lp
= td
->td_lwp
;
1246 struct timespec
*ktsp
, kts
;
1251 * Get timeout if any.
1253 if (uap
->ts
!= NULL
) {
1254 error
= copyin(uap
->ts
, &kts
, sizeof (kts
));
1263 * Install temporary signal mask if any provided.
1265 if (uap
->sigmask
!= NULL
) {
1266 error
= copyin(uap
->sigmask
, &sigmask
, sizeof(sigmask
));
1269 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
1270 lp
->lwp_oldsigmask
= lp
->lwp_sigmask
;
1271 SIG_CANTMASK(sigmask
);
1272 lp
->lwp_sigmask
= sigmask
;
1273 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
1276 error
= dopoll(uap
->nfds
, uap
->fds
, ktsp
, &uap
->sysmsg_result
,
1277 ktsp
!= NULL
? KEVENT_TIMEOUT_PRECISE
: 0);
1279 if (uap
->sigmask
!= NULL
) {
1280 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
1281 /* dopoll() responsible for turning ERESTART into EINTR */
1282 KKASSERT(error
!= ERESTART
);
1283 if (error
== EINTR
) {
1285 * We can't restore the previous signal mask now
1286 * because it could block the signal that interrupted
1287 * us. So make a note to restore it after executing
1290 lp
->lwp_flags
|= LWP_OLDMASK
;
1293 * No handler to run. Restore previous mask immediately.
1295 lp
->lwp_sigmask
= lp
->lwp_oldsigmask
;
1297 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
1304 poll_copyin(void *arg
, struct kevent
*kevp
, int maxevents
, int *events
)
1306 struct poll_kevent_copyin_args
*pkap
;
1311 pkap
= (struct poll_kevent_copyin_args
*)arg
;
1313 while (pkap
->pfds
< pkap
->nfds
) {
1314 pfd
= &pkap
->fds
[pkap
->pfds
];
1316 /* Clear return events */
1319 /* Do not check if fd is equal to -1 */
1320 if (pfd
->fd
== -1) {
1326 if (pfd
->events
& (POLLIN
| POLLRDNORM
))
1328 if (pfd
->events
& (POLLOUT
| POLLWRNORM
))
1330 if (pfd
->events
& (POLLPRI
| POLLRDBAND
))
1333 if (*events
+ kev_count
> maxevents
)
1337 * NOTE: A combined serial number and poll array index is
1338 * stored in kev->udata.
1340 kev
= &kevp
[*events
];
1341 if (pfd
->events
& (POLLIN
| POLLRDNORM
)) {
1342 EV_SET(kev
++, pfd
->fd
, EVFILT_READ
, EV_ADD
|EV_ENABLE
,
1343 NOTE_OLDAPI
, 0, (void *)(uintptr_t)
1344 (pkap
->lwp
->lwp_kqueue_serial
+ pkap
->pfds
));
1346 if (pfd
->events
& (POLLOUT
| POLLWRNORM
)) {
1347 EV_SET(kev
++, pfd
->fd
, EVFILT_WRITE
, EV_ADD
|EV_ENABLE
,
1348 NOTE_OLDAPI
, 0, (void *)(uintptr_t)
1349 (pkap
->lwp
->lwp_kqueue_serial
+ pkap
->pfds
));
1351 if (pfd
->events
& (POLLPRI
| POLLRDBAND
)) {
1352 EV_SET(kev
++, pfd
->fd
, EVFILT_EXCEPT
, EV_ADD
|EV_ENABLE
,
1353 NOTE_OLDAPI
| NOTE_OOB
, 0,
1355 (pkap
->lwp
->lwp_kqueue_serial
+ pkap
->pfds
));
1359 kprintf("poll index %d/%d fd %d events %08x serial %d\n",
1360 pkap
->pfds
, pkap
->nfds
-1, pfd
->fd
, pfd
->events
,
1361 pkap
->lwp
->lwp_kqueue_serial
);
1365 (*events
) += kev_count
;
1372 poll_copyout(void *arg
, struct kevent
*kevp
, int count
, int *res
)
1374 struct poll_kevent_copyin_args
*pkap
;
1381 pkap
= (struct poll_kevent_copyin_args
*)arg
;
1383 for (i
= 0; i
< count
; ++i
) {
1385 * Extract the poll array index and delete spurious events.
1386 * We can easily tell if the serial number is incorrect
1387 * by checking whether the extracted index is out of range.
1389 pi
= (u_int
)(uintptr_t)kevp
[i
].udata
-
1390 (u_int
)pkap
->lwp
->lwp_kqueue_serial
;
1392 if (pi
>= pkap
->nfds
) {
1394 kev
.flags
= EV_DISABLE
|EV_DELETE
;
1395 kqueue_register(&pkap
->lwp
->lwp_kqueue
, &kev
);
1397 kprintf("poll index %d out of range against serial %d\n",
1398 pi
, pkap
->lwp
->lwp_kqueue_serial
);
1401 pfd
= &pkap
->fds
[pi
];
1402 if (kevp
[i
].ident
== pfd
->fd
) {
1404 * A single descriptor may generate an error against
1405 * more than one filter, make sure to set the
1406 * appropriate flags but do not increment (*res)
1409 count_res
= (pfd
->revents
== 0);
1410 if (kevp
[i
].flags
& EV_ERROR
) {
1411 switch(kevp
[i
].data
) {
1414 /* Bad file descriptor */
1417 pfd
->revents
|= POLLNVAL
;
1421 * Poll silently swallows any unknown
1422 * errors except in the case of POLLPRI
1423 * (OOB/urgent data).
1425 * ALWAYS filter out EOPNOTSUPP errors
1426 * from filters, common applications
1427 * set POLLPRI|POLLRDBAND and most
1428 * filters do not support EVFILT_EXCEPT.
1430 * We also filter out ENODEV since dev_dkqfilter
1431 * returns ENODEV if EOPNOTSUPP is returned in an
1436 if (kevp
[i
].filter
!= EVFILT_READ
&&
1437 kevp
[i
].filter
!= EVFILT_WRITE
&&
1438 kevp
[i
].data
!= EOPNOTSUPP
&&
1439 kevp
[i
].data
!= ENODEV
) {
1442 pfd
->revents
|= POLLERR
;
1447 kprintf("poll index %d fd %d "
1448 "filter %d error %jd\n",
1451 (intmax_t)kevp
[i
].data
);
1456 switch (kevp
[i
].filter
) {
1460 * NODATA on the read side can indicate a
1461 * half-closed situation and not necessarily
1462 * a disconnect, so depend on the user
1463 * issuing a read() and getting 0 bytes back.
1465 if (kevp
[i
].flags
& EV_NODATA
)
1466 pfd
->revents
|= POLLHUP
;
1468 if ((kevp
[i
].flags
& EV_EOF
) &&
1469 kevp
[i
].fflags
!= 0)
1470 pfd
->revents
|= POLLERR
;
1471 if (pfd
->events
& POLLIN
)
1472 pfd
->revents
|= POLLIN
;
1473 if (pfd
->events
& POLLRDNORM
)
1474 pfd
->revents
|= POLLRDNORM
;
1478 * As per the OpenGroup POLLHUP is mutually
1479 * exclusive with the writability flags. I
1480 * consider this a bit broken but...
1482 * In this case a disconnect is implied even
1483 * for a half-closed (write side) situation.
1485 if (kevp
[i
].flags
& EV_EOF
) {
1486 pfd
->revents
|= POLLHUP
;
1487 if (kevp
[i
].fflags
!= 0)
1488 pfd
->revents
|= POLLERR
;
1490 if (pfd
->events
& POLLOUT
)
1491 pfd
->revents
|= POLLOUT
;
1492 if (pfd
->events
& POLLWRNORM
)
1493 pfd
->revents
|= POLLWRNORM
;
1498 * EV_NODATA should never be tagged for this
1501 if (pfd
->events
& POLLPRI
)
1502 pfd
->revents
|= POLLPRI
;
1503 if (pfd
->events
& POLLRDBAND
)
1504 pfd
->revents
|= POLLRDBAND
;
1509 kprintf("poll index %d/%d fd %d revents %08x\n",
1510 pi
, pkap
->nfds
, pfd
->fd
, pfd
->revents
);
1513 if (count_res
&& pfd
->revents
)
1517 kprintf("poll index %d mismatch %ju/%d\n",
1518 pi
, (uintmax_t)kevp
[i
].ident
, pfd
->fd
);
1527 dopoll(int nfds
, struct pollfd
*fds
, struct timespec
*ts
, int *res
, int flags
)
1529 struct poll_kevent_copyin_args ka
;
1530 struct pollfd sfds
[64];
1538 if (nfds
== 0 && ts
)
1539 return (dotimeout_only(ts
));
1542 * This is a bit arbitrary but we need to limit internal kmallocs.
1544 if (nfds
> maxfilesperproc
* 2)
1545 nfds
= maxfilesperproc
* 2;
1546 bytes
= sizeof(struct pollfd
) * nfds
;
1548 ka
.lwp
= curthread
->td_lwp
;
1556 ka
.fds
= kmalloc(bytes
, M_SELECT
, M_WAITOK
);
1558 error
= copyin(fds
, ka
.fds
, bytes
);
1560 error
= kern_kevent(&ka
.lwp
->lwp_kqueue
, 0x7FFFFFFF, res
, &ka
,
1561 poll_copyin
, poll_copyout
, ts
, flags
);
1564 error
= copyout(ka
.fds
, fds
, bytes
);
1567 kfree(ka
.fds
, M_SELECT
);
1569 ka
.lwp
->lwp_kqueue_serial
+= nfds
;
1575 socket_wait_copyin(void *arg
, struct kevent
*kevp
, int maxevents
, int *events
)
1581 socket_wait_copyout(void *arg
, struct kevent
*kevp
, int count
, int *res
)
1587 extern struct fileops socketops
;
1590 * NOTE: Callers of socket_wait() must already have a reference on the
1594 socket_wait(struct socket
*so
, struct timespec
*ts
, int *res
)
1596 struct thread
*td
= curthread
;
1602 if ((error
= falloc(td
->td_lwp
, &fp
, &fd
)) != 0)
1605 fp
->f_type
= DTYPE_SOCKET
;
1606 fp
->f_flag
= FREAD
| FWRITE
;
1607 fp
->f_ops
= &socketops
;
1609 fsetfd(td
->td_lwp
->lwp_proc
->p_fd
, fp
, fd
);
1610 fsetfdflags(td
->td_proc
->p_fd
, fd
, UF_EXCLOSE
);
1612 bzero(&kq
, sizeof(kq
));
1613 kqueue_init(&kq
, td
->td_lwp
->lwp_proc
->p_fd
);
1614 EV_SET(&kev
, fd
, EVFILT_READ
, EV_ADD
|EV_ENABLE
, 0, 0, NULL
);
1615 if ((error
= kqueue_register(&kq
, &kev
)) != 0) {
1620 error
= kern_kevent(&kq
, 1, res
, NULL
, socket_wait_copyin
,
1621 socket_wait_copyout
, ts
, 0);
1623 EV_SET(&kev
, fd
, EVFILT_READ
, EV_DELETE
|EV_DISABLE
, 0, 0, NULL
);
1624 kqueue_register(&kq
, &kev
);
1625 fp
->f_ops
= &badfileops
;
/*
 * OpenBSD poll system call.
 *
 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
 * Implemented here as a thin forward to the native sys_poll(), relying
 * on the argument structures being layout-compatible — TODO confirm
 * that openbsd_poll_args and poll_args really match field-for-field.
 *
 * NOTE(review): return-type line and braces were lost in extraction
 * and have been reconstructed — confirm against upstream.
 */
int
sys_openbsd_poll(struct openbsd_poll_args *uap)
{
	return (sys_poll((struct poll_args *)uap));
}
1645 seltrue(cdev_t dev
, int events
)
1647 return (events
& (POLLIN
| POLLOUT
| POLLRDNORM
| POLLWRNORM
));