2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
35 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
38 #include "opt_ktrace.h"
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/sysmsg.h>
43 #include <sys/event.h>
44 #include <sys/filedesc.h>
45 #include <sys/filio.h>
46 #include <sys/fcntl.h>
49 #include <sys/signalvar.h>
50 #include <sys/socketvar.h>
51 #include <sys/malloc.h>
53 #include <sys/kernel.h>
54 #include <sys/kern_syscall.h>
55 #include <sys/mapped_ioctl.h>
57 #include <sys/queue.h>
58 #include <sys/resourcevar.h>
59 #include <sys/socketops.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
64 #include <sys/ktrace.h>
67 #include <vm/vm_page.h>
69 #include <sys/file2.h>
70 #include <sys/spinlock2.h>
72 #include <machine/limits.h>
74 static MALLOC_DEFINE(M_IOCTLOPS
, "ioctlops", "ioctl data buffer");
75 static MALLOC_DEFINE(M_IOCTLMAP
, "ioctlmap", "mapped ioctl handler buffer");
76 static MALLOC_DEFINE(M_SELECT
, "select", "select() buffer");
77 MALLOC_DEFINE(M_IOV
, "iov", "large iov's");
79 typedef struct kfd_set
{
83 enum select_copyin_states
{
84 COPYIN_READ
, COPYIN_WRITE
, COPYIN_EXCEPT
, COPYIN_DONE
};
86 struct select_kevent_copyin_args
{
90 int active_set
; /* One of select_copyin_states */
91 struct lwp
*lwp
; /* Pointer to our lwp */
92 int num_fds
; /* Number of file descriptors (syscall arg) */
93 int proc_fds
; /* Processed fd's (wraps) */
94 int error
; /* Returned to userland */
97 struct poll_kevent_copyin_args
{
105 static struct lwkt_token mioctl_token
= LWKT_TOKEN_INITIALIZER(mioctl_token
);
107 static int doselect(int nd
, fd_set
*in
, fd_set
*ou
, fd_set
*ex
,
108 struct timespec
*ts
, int *res
);
109 static int dopoll(int nfds
, struct pollfd
*fds
, struct timespec
*ts
,
110 int *res
, int flags
);
111 static int dofileread(int, struct file
*, struct uio
*, int, size_t *);
112 static int dofilewrite(int, struct file
*, struct uio
*, int, size_t *);
120 sys_read(struct sysmsg
*sysmsg
, const struct read_args
*uap
)
122 struct thread
*td
= curthread
;
127 if ((ssize_t
)uap
->nbyte
< 0)
130 aiov
.iov_base
= uap
->buf
;
131 aiov
.iov_len
= uap
->nbyte
;
132 auio
.uio_iov
= &aiov
;
134 auio
.uio_offset
= -1;
135 auio
.uio_resid
= uap
->nbyte
;
136 auio
.uio_rw
= UIO_READ
;
137 auio
.uio_segflg
= UIO_USERSPACE
;
140 error
= kern_preadv(uap
->fd
, &auio
, 0, &sysmsg
->sysmsg_szresult
);
145 * Positioned (Pread) read system call
150 sys_extpread(struct sysmsg
*sysmsg
, const struct extpread_args
*uap
)
152 struct thread
*td
= curthread
;
158 if ((ssize_t
)uap
->nbyte
< 0)
161 aiov
.iov_base
= uap
->buf
;
162 aiov
.iov_len
= uap
->nbyte
;
163 auio
.uio_iov
= &aiov
;
165 auio
.uio_offset
= uap
->offset
;
166 auio
.uio_resid
= uap
->nbyte
;
167 auio
.uio_rw
= UIO_READ
;
168 auio
.uio_segflg
= UIO_USERSPACE
;
171 flags
= uap
->flags
& O_FMASK
;
172 if (uap
->offset
!= (off_t
)-1)
175 error
= kern_preadv(uap
->fd
, &auio
, flags
, &sysmsg
->sysmsg_szresult
);
180 * Scatter read system call.
185 sys_readv(struct sysmsg
*sysmsg
, const struct readv_args
*uap
)
187 struct thread
*td
= curthread
;
189 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
192 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
197 auio
.uio_iovcnt
= uap
->iovcnt
;
198 auio
.uio_offset
= -1;
199 auio
.uio_rw
= UIO_READ
;
200 auio
.uio_segflg
= UIO_USERSPACE
;
203 error
= kern_preadv(uap
->fd
, &auio
, 0, &sysmsg
->sysmsg_szresult
);
205 iovec_free(&iov
, aiov
);
211 * Scatter positioned read system call.
216 sys_extpreadv(struct sysmsg
*sysmsg
, const struct extpreadv_args
*uap
)
218 struct thread
*td
= curthread
;
220 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
224 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
229 auio
.uio_iovcnt
= uap
->iovcnt
;
230 auio
.uio_offset
= uap
->offset
;
231 auio
.uio_rw
= UIO_READ
;
232 auio
.uio_segflg
= UIO_USERSPACE
;
235 flags
= uap
->flags
& O_FMASK
;
236 if (uap
->offset
!= (off_t
)-1)
239 error
= kern_preadv(uap
->fd
, &auio
, flags
, &sysmsg
->sysmsg_szresult
);
241 iovec_free(&iov
, aiov
);
249 kern_preadv(int fd
, struct uio
*auio
, int flags
, size_t *res
)
251 struct thread
*td
= curthread
;
255 fp
= holdfp(td
, fd
, FREAD
);
258 if (flags
& O_FOFFSET
&& fp
->f_type
!= DTYPE_VNODE
) {
261 error
= dofileread(fd
, fp
, auio
, flags
, res
);
269 * Common code for readv and preadv that reads data in
270 * from a file using the passed in uio, offset, and flags.
272 * MPALMOSTSAFE - ktrace needs help
275 dofileread(int fd
, struct file
*fp
, struct uio
*auio
, int flags
, size_t *res
)
280 struct thread
*td
= curthread
;
281 struct iovec
*ktriov
= NULL
;
287 * if tracing, save a copy of iovec
289 if (KTRPOINT(td
, KTR_GENIO
)) {
290 int iovlen
= auio
->uio_iovcnt
* sizeof(struct iovec
);
292 ktriov
= kmalloc(iovlen
, M_TEMP
, M_WAITOK
);
293 bcopy((caddr_t
)auio
->uio_iov
, (caddr_t
)ktriov
, iovlen
);
297 len
= auio
->uio_resid
;
298 error
= fo_read(fp
, auio
, fp
->f_cred
, flags
);
300 if (auio
->uio_resid
!= len
&& (error
== ERESTART
||
301 error
== EINTR
|| error
== EWOULDBLOCK
))
305 if (ktriov
!= NULL
) {
307 ktruio
.uio_iov
= ktriov
;
308 ktruio
.uio_resid
= len
- auio
->uio_resid
;
309 ktrgenio(td
->td_lwp
, fd
, UIO_READ
, &ktruio
, error
);
311 kfree(ktriov
, M_TEMP
);
315 *res
= len
- auio
->uio_resid
;
326 sys_write(struct sysmsg
*sysmsg
, const struct write_args
*uap
)
328 struct thread
*td
= curthread
;
333 if ((ssize_t
)uap
->nbyte
< 0)
336 aiov
.iov_base
= (void *)(uintptr_t)uap
->buf
;
337 aiov
.iov_len
= uap
->nbyte
;
338 auio
.uio_iov
= &aiov
;
340 auio
.uio_offset
= -1;
341 auio
.uio_resid
= uap
->nbyte
;
342 auio
.uio_rw
= UIO_WRITE
;
343 auio
.uio_segflg
= UIO_USERSPACE
;
346 error
= kern_pwritev(uap
->fd
, &auio
, 0, &sysmsg
->sysmsg_szresult
);
357 sys_extpwrite(struct sysmsg
*sysmsg
, const struct extpwrite_args
*uap
)
359 struct thread
*td
= curthread
;
365 if ((ssize_t
)uap
->nbyte
< 0)
368 aiov
.iov_base
= (void *)(uintptr_t)uap
->buf
;
369 aiov
.iov_len
= uap
->nbyte
;
370 auio
.uio_iov
= &aiov
;
372 auio
.uio_offset
= uap
->offset
;
373 auio
.uio_resid
= uap
->nbyte
;
374 auio
.uio_rw
= UIO_WRITE
;
375 auio
.uio_segflg
= UIO_USERSPACE
;
378 flags
= uap
->flags
& O_FMASK
;
379 if (uap
->offset
!= (off_t
)-1)
381 error
= kern_pwritev(uap
->fd
, &auio
, flags
, &sysmsg
->sysmsg_szresult
);
389 sys_writev(struct sysmsg
*sysmsg
, const struct writev_args
*uap
)
391 struct thread
*td
= curthread
;
393 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
396 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
401 auio
.uio_iovcnt
= uap
->iovcnt
;
402 auio
.uio_offset
= -1;
403 auio
.uio_rw
= UIO_WRITE
;
404 auio
.uio_segflg
= UIO_USERSPACE
;
407 error
= kern_pwritev(uap
->fd
, &auio
, 0, &sysmsg
->sysmsg_szresult
);
409 iovec_free(&iov
, aiov
);
415 * Gather positioned write system call
420 sys_extpwritev(struct sysmsg
*sysmsg
, const struct extpwritev_args
*uap
)
422 struct thread
*td
= curthread
;
424 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
428 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
433 auio
.uio_iovcnt
= uap
->iovcnt
;
434 auio
.uio_offset
= uap
->offset
;
435 auio
.uio_rw
= UIO_WRITE
;
436 auio
.uio_segflg
= UIO_USERSPACE
;
439 flags
= uap
->flags
& O_FMASK
;
440 if (uap
->offset
!= (off_t
)-1)
443 error
= kern_pwritev(uap
->fd
, &auio
, flags
, &sysmsg
->sysmsg_szresult
);
445 iovec_free(&iov
, aiov
);
453 kern_pwritev(int fd
, struct uio
*auio
, int flags
, size_t *res
)
455 struct thread
*td
= curthread
;
459 fp
= holdfp(td
, fd
, FWRITE
);
462 else if ((flags
& O_FOFFSET
) && fp
->f_type
!= DTYPE_VNODE
) {
465 error
= dofilewrite(fd
, fp
, auio
, flags
, res
);
473 * Common code for writev and pwritev that writes data to
474 * a file using the passed in uio, offset, and flags.
476 * MPALMOSTSAFE - ktrace needs help
479 dofilewrite(int fd
, struct file
*fp
, struct uio
*auio
, int flags
, size_t *res
)
481 struct thread
*td
= curthread
;
482 struct lwp
*lp
= td
->td_lwp
;
486 struct iovec
*ktriov
= NULL
;
492 * if tracing, save a copy of iovec and uio
494 if (KTRPOINT(td
, KTR_GENIO
)) {
495 int iovlen
= auio
->uio_iovcnt
* sizeof(struct iovec
);
497 ktriov
= kmalloc(iovlen
, M_TEMP
, M_WAITOK
);
498 bcopy((caddr_t
)auio
->uio_iov
, (caddr_t
)ktriov
, iovlen
);
502 len
= auio
->uio_resid
;
503 error
= fo_write(fp
, auio
, fp
->f_cred
, flags
);
505 if (auio
->uio_resid
!= len
&& (error
== ERESTART
||
506 error
== EINTR
|| error
== EWOULDBLOCK
))
508 /* Socket layer is responsible for issuing SIGPIPE. */
509 if (error
== EPIPE
&& fp
->f_type
!= DTYPE_SOCKET
)
510 lwpsignal(lp
->lwp_proc
, lp
, SIGPIPE
);
513 if (ktriov
!= NULL
) {
515 ktruio
.uio_iov
= ktriov
;
516 ktruio
.uio_resid
= len
- auio
->uio_resid
;
517 ktrgenio(lp
, fd
, UIO_WRITE
, &ktruio
, error
);
519 kfree(ktriov
, M_TEMP
);
523 *res
= len
- auio
->uio_resid
;
534 sys_ioctl(struct sysmsg
*sysmsg
, const struct ioctl_args
*uap
)
538 error
= mapped_ioctl(uap
->fd
, uap
->com
, uap
->data
, NULL
, sysmsg
);
542 struct ioctl_map_entry
{
544 struct ioctl_map_range
*cmd_ranges
;
545 LIST_ENTRY(ioctl_map_entry
) entries
;
549 * The true heart of all ioctl syscall handlers (native, emulation).
550 * If map != NULL, it will be searched for a matching entry for com,
551 * and appropriate conversions/conversion functions will be utilized.
556 mapped_ioctl(int fd
, u_long com
, caddr_t uspc_data
, struct ioctl_map
*map
,
559 struct thread
*td
= curthread
;
560 struct proc
*p
= td
->td_proc
;
563 struct ioctl_map_range
*iomc
= NULL
;
569 #define STK_PARAMS 128
571 char stkbuf
[STK_PARAMS
];
579 fp
= holdfp(td
, fd
, FREAD
|FWRITE
);
583 if (map
!= NULL
) { /* obey translation map */
585 struct ioctl_map_entry
*e
;
587 maskcmd
= com
& map
->mask
;
589 lwkt_gettoken(&mioctl_token
);
590 LIST_FOREACH(e
, &map
->mapping
, entries
) {
591 for (iomc
= e
->cmd_ranges
; iomc
->start
!= 0 ||
592 iomc
->maptocmd
!= 0 || iomc
->wrapfunc
!= NULL
||
593 iomc
->mapfunc
!= NULL
;
595 if (maskcmd
>= iomc
->start
&&
596 maskcmd
<= iomc
->end
)
600 /* Did we find a match? */
601 if (iomc
->start
!= 0 || iomc
->maptocmd
!= 0 ||
602 iomc
->wrapfunc
!= NULL
|| iomc
->mapfunc
!= NULL
)
605 lwkt_reltoken(&mioctl_token
);
608 (iomc
->start
== 0 && iomc
->maptocmd
== 0
609 && iomc
->wrapfunc
== NULL
&& iomc
->mapfunc
== NULL
)) {
610 kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
611 map
->sys
, fd
, maskcmd
,
612 (int)((maskcmd
>> 8) & 0xff),
613 (int)(maskcmd
& 0xff));
619 * If it's a non-range one to one mapping, maptocmd should be
620 * correct. If it's a ranged one to one mapping, we pass the
621 * original value of com, and for a range mapped to a different
622 * range, we always need a mapping function to translate the
623 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
625 if (iomc
->start
== iomc
->end
&& iomc
->maptocmd
== iomc
->maptoend
) {
626 com
= iomc
->maptocmd
;
627 } else if (iomc
->start
== iomc
->maptocmd
&& iomc
->end
== iomc
->maptoend
) {
628 if (iomc
->mapfunc
!= NULL
)
629 com
= iomc
->mapfunc(iomc
->start
, iomc
->end
,
630 iomc
->start
, iomc
->end
,
633 if (iomc
->mapfunc
!= NULL
) {
634 com
= iomc
->mapfunc(iomc
->start
, iomc
->end
,
635 iomc
->maptocmd
, iomc
->maptoend
,
638 kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
639 map
->sys
, fd
, maskcmd
,
640 (int)((maskcmd
>> 8) & 0xff),
641 (int)(maskcmd
& 0xff));
650 error
= fclrfdflags(p
->p_fd
, fd
, UF_EXCLOSE
);
653 error
= fsetfdflags(p
->p_fd
, fd
, UF_EXCLOSE
);
658 * Interpret high order word to find amount of data to be
659 * copied to/from the user's address space.
661 size
= IOCPARM_LEN(com
);
662 if (size
> IOCPARM_MAX
) {
667 if ((com
& IOC_VOID
) == 0 && size
> sizeof(ubuf
.stkbuf
)) {
668 memp
= kmalloc(size
, M_IOCTLOPS
, M_WAITOK
);
674 if (com
& IOC_VOID
) {
675 *(caddr_t
*)data
= uspc_data
;
676 } else if (com
& IOC_IN
) {
678 error
= copyin(uspc_data
, data
, (size_t)size
);
682 *(caddr_t
*)data
= uspc_data
;
684 } else if ((com
& IOC_OUT
) != 0 && size
) {
686 * Zero the buffer so the user always
687 * gets back something deterministic.
689 bzero(data
, (size_t)size
);
694 if ((tmp
= *(int *)data
))
695 atomic_set_int(&fp
->f_flag
, FNONBLOCK
);
697 atomic_clear_int(&fp
->f_flag
, FNONBLOCK
);
702 if ((tmp
= *(int *)data
))
703 atomic_set_int(&fp
->f_flag
, FASYNC
);
705 atomic_clear_int(&fp
->f_flag
, FASYNC
);
706 error
= fo_ioctl(fp
, FIOASYNC
, (caddr_t
)&tmp
, cred
, msg
);
711 * If there is a override function,
712 * call it instead of directly routing the call
714 if (map
!= NULL
&& iomc
->wrapfunc
!= NULL
)
715 error
= iomc
->wrapfunc(fp
, com
, ocom
, data
, cred
);
717 error
= fo_ioctl(fp
, com
, data
, cred
, msg
);
719 * Copy any data to user, size was
720 * already set and checked above.
722 if (error
== 0 && (com
& IOC_OUT
) != 0 && size
!= 0)
723 error
= copyout(data
, uspc_data
, (size_t)size
);
728 kfree(memp
, M_IOCTLOPS
);
738 mapped_ioctl_register_handler(struct ioctl_map_handler
*he
)
740 struct ioctl_map_entry
*ne
;
742 KKASSERT(he
!= NULL
&& he
->map
!= NULL
&& he
->cmd_ranges
!= NULL
&&
743 he
->subsys
!= NULL
&& *he
->subsys
!= '\0');
745 ne
= kmalloc(sizeof(struct ioctl_map_entry
), M_IOCTLMAP
,
748 ne
->subsys
= he
->subsys
;
749 ne
->cmd_ranges
= he
->cmd_ranges
;
751 lwkt_gettoken(&mioctl_token
);
752 LIST_INSERT_HEAD(&he
->map
->mapping
, ne
, entries
);
753 lwkt_reltoken(&mioctl_token
);
762 mapped_ioctl_unregister_handler(struct ioctl_map_handler
*he
)
764 struct ioctl_map_entry
*ne
;
767 KKASSERT(he
!= NULL
&& he
->map
!= NULL
&& he
->cmd_ranges
!= NULL
);
769 lwkt_gettoken(&mioctl_token
);
770 LIST_FOREACH(ne
, &he
->map
->mapping
, entries
) {
771 if (ne
->cmd_ranges
== he
->cmd_ranges
) {
772 LIST_REMOVE(ne
, entries
);
773 kfree(ne
, M_IOCTLMAP
);
778 lwkt_reltoken(&mioctl_token
);
782 static int nseldebug
;
783 SYSCTL_INT(_kern
, OID_AUTO
, nseldebug
, CTLFLAG_RW
, &nseldebug
, 0, "");
786 * Select system call.
791 sys_select(struct sysmsg
*sysmsg
, const struct select_args
*uap
)
794 struct timespec
*ktsp
, kts
;
798 * Get timeout if any.
800 if (uap
->tv
!= NULL
) {
801 error
= copyin(uap
->tv
, &ktv
, sizeof (ktv
));
804 TIMEVAL_TO_TIMESPEC(&ktv
, &kts
);
813 error
= doselect(uap
->nd
, uap
->in
, uap
->ou
, uap
->ex
, ktsp
,
814 &sysmsg
->sysmsg_result
);
821 * Pselect system call.
824 sys_pselect(struct sysmsg
*sysmsg
, const struct pselect_args
*uap
)
826 struct thread
*td
= curthread
;
827 struct lwp
*lp
= td
->td_lwp
;
828 struct timespec
*ktsp
, kts
;
833 * Get timeout if any.
835 if (uap
->ts
!= NULL
) {
836 error
= copyin(uap
->ts
, &kts
, sizeof (kts
));
845 * Install temporary signal mask if any provided.
847 if (uap
->sigmask
!= NULL
) {
848 error
= copyin(uap
->sigmask
, &sigmask
, sizeof(sigmask
));
851 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
852 lp
->lwp_oldsigmask
= lp
->lwp_sigmask
;
853 SIG_CANTMASK(sigmask
);
854 lp
->lwp_sigmask
= sigmask
;
855 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
861 error
= doselect(uap
->nd
, uap
->in
, uap
->ou
, uap
->ex
, ktsp
,
862 &sysmsg
->sysmsg_result
);
864 if (uap
->sigmask
!= NULL
) {
865 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
866 /* doselect() responsible for turning ERESTART into EINTR */
867 KKASSERT(error
!= ERESTART
);
868 if (error
== EINTR
) {
870 * We can't restore the previous signal mask now
871 * because it could block the signal that interrupted
872 * us. So make a note to restore it after executing
875 lp
->lwp_flags
|= LWP_OLDMASK
;
878 * No handler to run. Restore previous mask immediately.
880 lp
->lwp_sigmask
= lp
->lwp_oldsigmask
;
882 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
889 select_copyin(void *arg
, struct kevent
*kevp
, int maxevents
, int *events
)
891 struct select_kevent_copyin_args
*skap
= NULL
;
898 skap
= (struct select_kevent_copyin_args
*)arg
;
900 if (*events
== maxevents
)
903 while (skap
->active_set
< COPYIN_DONE
) {
904 switch (skap
->active_set
) {
907 * Register descriptors for the read filter
909 fdp
= skap
->read_set
;
910 filter
= EVFILT_READ
;
911 fflags
= NOTE_OLDAPI
;
919 * Register descriptors for the write filter
921 fdp
= skap
->write_set
;
922 filter
= EVFILT_WRITE
;
923 fflags
= NOTE_OLDAPI
;
931 * Register descriptors for the exception filter
933 fdp
= skap
->except_set
;
934 filter
= EVFILT_EXCEPT
;
935 fflags
= NOTE_OLDAPI
| NOTE_OOB
;
943 * Nothing left to register
949 while (skap
->proc_fds
< skap
->num_fds
) {
951 if (FD_ISSET(fd
, fdp
)) {
952 kev
= &kevp
[*events
];
953 EV_SET(kev
, fd
, filter
,
957 skap
->lwp
->lwp_kqueue_serial
);
962 kprintf("select fd %d filter %d "
963 "serial %ju\n", fd
, filter
,
965 skap
->lwp
->lwp_kqueue_serial
);
969 if (*events
== maxevents
)
980 select_copyout(void *arg
, struct kevent
*kevp
, int count
, int *res
)
982 struct select_kevent_copyin_args
*skap
;
987 skap
= (struct select_kevent_copyin_args
*)arg
;
989 for (i
= 0; i
< count
; ++i
) {
991 * Filter out and delete spurious events
993 if ((uint64_t)(uintptr_t)kevp
[i
].udata
!=
994 skap
->lwp
->lwp_kqueue_serial
)
996 panic("select_copyout: unexpected udata");
999 kev
.flags
= EV_DISABLE
|EV_DELETE
;
1001 kqueue_register(&skap
->lwp
->lwp_kqueue
, &kev
, &n
);
1003 kprintf("select fd %ju mismatched serial %ju\n",
1004 (uintmax_t)kevp
[i
].ident
,
1005 (uintmax_t)skap
->lwp
->lwp_kqueue_serial
);
1013 if (kevp
[i
].flags
& EV_ERROR
) {
1014 int error
= kevp
[i
].data
;
1019 * A bad file descriptor is considered a
1020 * fatal error for select, bail out.
1022 skap
->error
= error
;
1028 * Select silently swallows any unknown errors
1029 * for descriptors in the read or write sets.
1031 * ALWAYS filter out EOPNOTSUPP errors from
1032 * filters (at least until all filters support
1035 * We also filter out ENODEV since dev_dkqfilter
1036 * returns ENODEV if EOPNOTSUPP is returned in an
1041 if (kevp
[i
].filter
!= EVFILT_READ
&&
1042 kevp
[i
].filter
!= EVFILT_WRITE
&&
1043 error
!= EOPNOTSUPP
&&
1045 skap
->error
= error
;
1053 * We must deregister any unsupported select events
1054 * to avoid a live-lock.
1057 kprintf("select fd %ju filter %d error %d\n",
1058 (uintmax_t)kevp
[i
].ident
,
1059 kevp
[i
].filter
, error
);
1064 switch (kevp
[i
].filter
) {
1066 FD_SET(kevp
[i
].ident
, skap
->read_set
);
1069 FD_SET(kevp
[i
].ident
, skap
->write_set
);
1072 FD_SET(kevp
[i
].ident
, skap
->except_set
);
1083 * Copy select bits in from userland. Allocate kernel memory if the
1087 getbits(int bytes
, fd_set
*in_set
, kfd_set
**out_set
, kfd_set
*tmp_set
)
1092 if (bytes
< sizeof(*tmp_set
))
1095 *out_set
= kmalloc(bytes
, M_SELECT
, M_WAITOK
);
1096 error
= copyin(in_set
, *out_set
, bytes
);
1105 * Copy returned select bits back out to userland.
1108 putbits(int bytes
, kfd_set
*in_set
, fd_set
*out_set
)
1113 error
= copyout(in_set
, out_set
, bytes
);
1121 dotimeout_only(struct timespec
*ts
)
1123 return(nanosleep1(ts
, NULL
));
1127 * Common code for sys_select() and sys_pselect().
1129 * in, out and ex are userland pointers. ts must point to validated
1130 * kernel-side timeout value or NULL for infinite timeout. res must
1131 * point to syscall return value.
1134 doselect(int nd
, fd_set
*read
, fd_set
*write
, fd_set
*except
,
1135 struct timespec
*ts
, int *res
)
1137 struct proc
*p
= curproc
;
1138 struct select_kevent_copyin_args
*kap
, ka
;
1148 return (dotimeout_only(ts
));
1150 if (nd
> p
->p_fd
->fd_nfiles
) /* limit kmalloc */
1151 nd
= p
->p_fd
->fd_nfiles
;
1154 kap
->lwp
= curthread
->td_lwp
;
1158 kap
->active_set
= COPYIN_READ
;
1161 * Calculate bytes based on the number of __fd_mask[] array entries
1162 * multiplied by the size of __fd_mask.
1164 bytes
= howmany(nd
, __NFDBITS
) * sizeof(__fd_mask
);
1166 /* kap->read_set = NULL; not needed */
1167 kap
->write_set
= NULL
;
1168 kap
->except_set
= NULL
;
1170 error
= getbits(bytes
, read
, &kap
->read_set
, &read_tmp
);
1172 error
= getbits(bytes
, write
, &kap
->write_set
, &write_tmp
);
1174 error
= getbits(bytes
, except
, &kap
->except_set
, &except_tmp
);
1179 * NOTE: Make sure the max events passed to kern_kevent() is
1180 * effectively unlimited. (nd * 3) accomplishes this.
1182 * (*res) continues to increment as returned events are
1185 error
= kern_kevent(&kap
->lwp
->lwp_kqueue
, 0x7FFFFFFF, res
, kap
,
1186 select_copyin
, select_copyout
, ts
,
1189 error
= putbits(bytes
, kap
->read_set
, read
);
1191 error
= putbits(bytes
, kap
->write_set
, write
);
1193 error
= putbits(bytes
, kap
->except_set
, except
);
1196 * An error from an individual event that should be passed
1197 * back to userland (EBADF)
1206 if (kap
->read_set
&& kap
->read_set
!= &read_tmp
)
1207 kfree(kap
->read_set
, M_SELECT
);
1208 if (kap
->write_set
&& kap
->write_set
!= &write_tmp
)
1209 kfree(kap
->write_set
, M_SELECT
);
1210 if (kap
->except_set
&& kap
->except_set
!= &except_tmp
)
1211 kfree(kap
->except_set
, M_SELECT
);
1213 kap
->lwp
->lwp_kqueue_serial
+= kap
->num_fds
;
1224 sys_poll(struct sysmsg
*sysmsg
, const struct poll_args
*uap
)
1226 struct timespec ts
, *tsp
;
1229 if (uap
->timeout
!= INFTIM
) {
1230 if (uap
->timeout
< 0)
1232 ts
.tv_sec
= uap
->timeout
/ 1000;
1233 ts
.tv_nsec
= (uap
->timeout
% 1000) * 1000 * 1000;
1239 error
= dopoll(uap
->nfds
, uap
->fds
, tsp
, &sysmsg
->sysmsg_result
, 0);
1245 * Ppoll system call.
1250 sys_ppoll(struct sysmsg
*sysmsg
, const struct ppoll_args
*uap
)
1252 struct thread
*td
= curthread
;
1253 struct lwp
*lp
= td
->td_lwp
;
1254 struct timespec
*ktsp
, kts
;
1259 * Get timeout if any.
1261 if (uap
->ts
!= NULL
) {
1262 error
= copyin(uap
->ts
, &kts
, sizeof (kts
));
1271 * Install temporary signal mask if any provided.
1273 if (uap
->sigmask
!= NULL
) {
1274 error
= copyin(uap
->sigmask
, &sigmask
, sizeof(sigmask
));
1277 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
1278 lp
->lwp_oldsigmask
= lp
->lwp_sigmask
;
1279 SIG_CANTMASK(sigmask
);
1280 lp
->lwp_sigmask
= sigmask
;
1281 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
1284 error
= dopoll(uap
->nfds
, uap
->fds
, ktsp
, &sysmsg
->sysmsg_result
,
1285 ktsp
!= NULL
? KEVENT_TIMEOUT_PRECISE
: 0);
1287 if (uap
->sigmask
!= NULL
) {
1288 lwkt_gettoken(&lp
->lwp_proc
->p_token
);
1289 /* dopoll() responsible for turning ERESTART into EINTR */
1290 KKASSERT(error
!= ERESTART
);
1291 if (error
== EINTR
) {
1293 * We can't restore the previous signal mask now
1294 * because it could block the signal that interrupted
1295 * us. So make a note to restore it after executing
1298 lp
->lwp_flags
|= LWP_OLDMASK
;
1301 * No handler to run. Restore previous mask immediately.
1303 lp
->lwp_sigmask
= lp
->lwp_oldsigmask
;
1305 lwkt_reltoken(&lp
->lwp_proc
->p_token
);
1312 poll_copyin(void *arg
, struct kevent
*kevp
, int maxevents
, int *events
)
1314 struct poll_kevent_copyin_args
*pkap
;
1319 pkap
= (struct poll_kevent_copyin_args
*)arg
;
1321 while (pkap
->pfds
< pkap
->nfds
) {
1322 pfd
= &pkap
->fds
[pkap
->pfds
];
1324 /* Clear return events */
1327 /* Do not check if fd is equal to -1 */
1328 if (pfd
->fd
== -1) {
1334 if (pfd
->events
& (POLLIN
| POLLHUP
| POLLRDNORM
))
1336 if (pfd
->events
& (POLLOUT
| POLLWRNORM
))
1338 if (pfd
->events
& (POLLPRI
| POLLRDBAND
))
1341 if (*events
+ kev_count
> maxevents
)
1345 * NOTE: A combined serial number and poll array index is
1346 * stored in kev->udata.
1348 kev
= &kevp
[*events
];
1349 if (pfd
->events
& (POLLIN
| POLLHUP
| POLLRDNORM
)) {
1350 int notes
= NOTE_OLDAPI
;
1351 if ((pfd
->events
& (POLLIN
| POLLRDNORM
)) == 0)
1352 notes
|= NOTE_HUPONLY
;
1354 EV_SET(kev
++, pfd
->fd
, EVFILT_READ
, EV_ADD
|EV_ENABLE
,
1355 notes
, 0, (void *)(uintptr_t)
1356 (pkap
->lwp
->lwp_kqueue_serial
+ pkap
->pfds
));
1358 if (pfd
->events
& (POLLOUT
| POLLWRNORM
)) {
1359 EV_SET(kev
++, pfd
->fd
, EVFILT_WRITE
, EV_ADD
|EV_ENABLE
,
1360 NOTE_OLDAPI
, 0, (void *)(uintptr_t)
1361 (pkap
->lwp
->lwp_kqueue_serial
+ pkap
->pfds
));
1363 if (pfd
->events
& (POLLPRI
| POLLRDBAND
)) {
1364 EV_SET(kev
++, pfd
->fd
, EVFILT_EXCEPT
, EV_ADD
|EV_ENABLE
,
1365 NOTE_OLDAPI
| NOTE_OOB
, 0,
1367 (pkap
->lwp
->lwp_kqueue_serial
+ pkap
->pfds
));
1371 kprintf("poll index %d/%d fd %d events %08x "
1372 "serial %ju\n", pkap
->pfds
, pkap
->nfds
-1,
1373 pfd
->fd
, pfd
->events
,
1374 (uintmax_t)pkap
->lwp
->lwp_kqueue_serial
);
1378 (*events
) += kev_count
;
1385 poll_copyout(void *arg
, struct kevent
*kevp
, int count
, int *res
)
1387 struct poll_kevent_copyin_args
*pkap
;
1395 pkap
= (struct poll_kevent_copyin_args
*)arg
;
1397 for (i
= 0; i
< count
; ++i
) {
1399 * Extract the poll array index and delete spurious events.
1400 * We can easily tell if the serial number is incorrect
1401 * by checking whether the extracted index is out of range.
1403 pi
= (uint64_t)(uintptr_t)kevp
[i
].udata
-
1404 pkap
->lwp
->lwp_kqueue_serial
;
1405 if (pi
>= pkap
->nfds
) {
1406 panic("poll_copyout: unexpected udata");
1409 kev
.flags
= EV_DISABLE
|EV_DELETE
;
1411 kqueue_register(&pkap
->lwp
->lwp_kqueue
, &kev
, &n
);
1413 kprintf("poll index %ju out of range against "
1414 "serial %ju\n", (uintmax_t)pi
,
1415 (uintmax_t)pkap
->lwp
->lwp_kqueue_serial
);
1421 * Locate the pollfd and process events
1423 pfd
= &pkap
->fds
[pi
];
1424 if (kevp
[i
].ident
== pfd
->fd
) {
1426 * A single descriptor may generate an error against
1427 * more than one filter, make sure to set the
1428 * appropriate flags but do not increment (*res)
1431 count_res
= (pfd
->revents
== 0);
1432 if (kevp
[i
].flags
& EV_ERROR
) {
1433 switch(kevp
[i
].data
) {
1436 /* Bad file descriptor */
1439 pfd
->revents
|= POLLNVAL
;
1443 * Poll silently swallows any unknown
1444 * errors except in the case of POLLPRI
1445 * (OOB/urgent data).
1447 * ALWAYS filter out EOPNOTSUPP errors
1448 * from filters, common applications
1449 * set POLLPRI|POLLRDBAND and most
1450 * filters do not support EVFILT_EXCEPT.
1452 * We also filter out ENODEV since
1453 * dev_dkqfilter returns ENODEV if
1454 * EOPNOTSUPP is returned in an
1459 if (kevp
[i
].filter
!= EVFILT_READ
&&
1460 kevp
[i
].filter
!= EVFILT_WRITE
&&
1461 kevp
[i
].data
!= EOPNOTSUPP
&&
1462 kevp
[i
].data
!= ENODEV
) {
1465 pfd
->revents
|= POLLERR
;
1469 if (pfd
->revents
== 0 && nseldebug
) {
1470 kprintf("poll index EV_ERROR %ju fd %d "
1471 "filter %d error %jd\n",
1472 (uintmax_t)pi
, pfd
->fd
,
1474 (intmax_t)kevp
[i
].data
);
1478 * Silently deregister any unhandled EV_ERROR
1479 * condition (usually EOPNOTSUPP).
1481 if (pfd
->revents
== 0)
1486 switch (kevp
[i
].filter
) {
1489 * NODATA on the read side can indicate a
1490 * half-closed situation and not necessarily
1491 * a disconnect, so depend on the user
1492 * issuing a read() and getting 0 bytes back.
1494 * If EV_HUP is set the peer completely
1495 * disconnected and we can set POLLHUP.
1496 * Linux can return POLLHUP even if read
1497 * data has not been drained, so we should
1500 /* if (kevp[i].flags & EV_NODATA) */ {
1501 if (kevp
[i
].flags
& EV_HUP
)
1502 pfd
->revents
|= POLLHUP
;
1504 if ((kevp
[i
].flags
& EV_EOF
) &&
1505 kevp
[i
].fflags
!= 0)
1506 pfd
->revents
|= POLLERR
;
1507 if (pfd
->events
& POLLIN
)
1508 pfd
->revents
|= POLLIN
;
1509 if (pfd
->events
& POLLRDNORM
)
1510 pfd
->revents
|= POLLRDNORM
;
1514 * As per the OpenGroup POLLHUP is mutually
1515 * exclusive with the writability flags. I
1516 * consider this a bit broken but...
1518 * In this case a disconnect is implied even
1519 * for a half-closed (write side) situation.
1521 if (kevp
[i
].flags
& EV_EOF
) {
1522 pfd
->revents
|= POLLHUP
;
1523 if (kevp
[i
].fflags
!= 0)
1524 pfd
->revents
|= POLLERR
;
1526 if (pfd
->events
& POLLOUT
)
1527 pfd
->revents
|= POLLOUT
;
1528 if (pfd
->events
& POLLWRNORM
)
1529 pfd
->revents
|= POLLWRNORM
;
1534 * EV_NODATA should never be tagged for this
1537 if (pfd
->events
& POLLPRI
)
1538 pfd
->revents
|= POLLPRI
;
1539 if (pfd
->events
& POLLRDBAND
)
1540 pfd
->revents
|= POLLRDBAND
;
1545 kprintf("poll index %ju/%d fd %d "
1546 "revents %08x\n", (uintmax_t)pi
, pkap
->nfds
,
1547 pfd
->fd
, pfd
->revents
);
1550 if (count_res
&& pfd
->revents
)
1555 * We must deregister any kqueue poll event that does not
1556 * set poll return bits to prevent a live-lock.
1558 if (pfd
->revents
== 0) {
1559 kprintf("poll index %ju no-action %ju/%d "
1560 "events=%08x kevpfilt=%d/%08x\n",
1561 (uintmax_t)pi
, (uintmax_t)kevp
[i
].ident
,
1562 pfd
->fd
, pfd
->events
,
1563 kevp
[i
].filter
, kevp
[i
].flags
);
1572 dopoll(int nfds
, struct pollfd
*fds
, struct timespec
*ts
, int *res
, int flags
)
1574 struct poll_kevent_copyin_args ka
;
1575 struct pollfd sfds
[64];
1579 flags
|= KEVENT_AUTO_STALE
;
1585 if (nfds
== 0 && ts
)
1586 return (dotimeout_only(ts
));
1589 * This is a bit arbitrary but we need to limit internal kmallocs.
1591 if (nfds
> maxfilesperproc
* 2)
1592 nfds
= maxfilesperproc
* 2;
1593 bytes
= sizeof(struct pollfd
) * nfds
;
1595 ka
.lwp
= curthread
->td_lwp
;
1603 ka
.fds
= kmalloc(bytes
, M_SELECT
, M_WAITOK
);
1605 error
= copyin(fds
, ka
.fds
, bytes
);
1607 error
= kern_kevent(&ka
.lwp
->lwp_kqueue
, 0x7FFFFFFF, res
, &ka
,
1608 poll_copyin
, poll_copyout
, ts
, flags
);
1611 error
= copyout(ka
.fds
, fds
, bytes
);
1614 kfree(ka
.fds
, M_SELECT
);
1616 ka
.lwp
->lwp_kqueue_serial
+= nfds
;
1622 socket_wait_copyin(void *arg
, struct kevent
*kevp
, int maxevents
, int *events
)
1628 socket_wait_copyout(void *arg
, struct kevent
*kevp
, int count
, int *res
)
1634 extern struct fileops socketops
;
1637 * NOTE: Callers of socket_wait() must already have a reference on the
1641 socket_wait(struct socket
*so
, struct timespec
*ts
, int *res
)
1643 struct thread
*td
= curthread
;
1650 if ((error
= falloc(td
->td_lwp
, &fp
, &fd
)) != 0)
1653 fp
->f_type
= DTYPE_SOCKET
;
1654 fp
->f_flag
= FREAD
| FWRITE
;
1655 fp
->f_ops
= &socketops
;
1657 fsetfd(td
->td_lwp
->lwp_proc
->p_fd
, fp
, fd
);
1658 fsetfdflags(td
->td_proc
->p_fd
, fd
, UF_EXCLOSE
);
1660 bzero(&kq
, sizeof(kq
));
1661 kqueue_init(&kq
, td
->td_lwp
->lwp_proc
->p_fd
);
1662 EV_SET(&kev
, fd
, EVFILT_READ
, EV_ADD
|EV_ENABLE
, 0, 0, NULL
);
1664 if ((error
= kqueue_register(&kq
, &kev
, &n
)) != 0) {
1669 error
= kern_kevent(&kq
, 1, res
, NULL
, socket_wait_copyin
,
1670 socket_wait_copyout
, ts
, 0);
1672 EV_SET(&kev
, fd
, EVFILT_READ
, EV_DELETE
|EV_DISABLE
, 0, 0, NULL
);
1674 kqueue_register(&kq
, &kev
, &n
);
1675 fp
->f_ops
= &badfileops
;
1682 * OpenBSD poll system call.
1683 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
1688 sys_openbsd_poll(struct sysmsg
*sysmsg
, const struct openbsd_poll_args
*uap
)
1690 return (sys_poll(sysmsg
, (const struct poll_args
*)uap
));
1695 seltrue(cdev_t dev
, int events
)
1697 return (events
& (POLLIN
| POLLOUT
| POLLRDNORM
| POLLWRNORM
));