2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
39 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
40 * $DragonFly: src/sys/kern/sys_generic.c,v 1.49 2008/05/05 22:09:44 dillon Exp $
43 #include "opt_ktrace.h"
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/sysproto.h>
48 #include <sys/filedesc.h>
49 #include <sys/filio.h>
50 #include <sys/fcntl.h>
53 #include <sys/signalvar.h>
54 #include <sys/socketvar.h>
56 #include <sys/kernel.h>
57 #include <sys/kern_syscall.h>
58 #include <sys/malloc.h>
59 #include <sys/mapped_ioctl.h>
61 #include <sys/queue.h>
62 #include <sys/resourcevar.h>
63 #include <sys/sysctl.h>
64 #include <sys/sysent.h>
67 #include <sys/ktrace.h>
70 #include <vm/vm_page.h>
72 #include <sys/file2.h>
73 #include <sys/mplock2.h>
75 #include <machine/limits.h>
77 static MALLOC_DEFINE(M_IOCTLOPS
, "ioctlops", "ioctl data buffer");
78 static MALLOC_DEFINE(M_IOCTLMAP
, "ioctlmap", "mapped ioctl handler buffer");
79 static MALLOC_DEFINE(M_SELECT
, "select", "select() buffer");
80 MALLOC_DEFINE(M_IOV
, "iov", "large iov's");
82 static int doselect(int nd
, fd_set
*in
, fd_set
*ou
, fd_set
*ex
,
83 struct timeval
*tv
, int *res
);
84 static int pollscan (struct proc
*, struct pollfd
*, u_int
, int *);
85 static int selscan (struct proc
*, fd_mask
**, fd_mask
**,
87 static int dofileread(int, struct file
*, struct uio
*, int, size_t *);
88 static int dofilewrite(int, struct file
*, struct uio
*, int, size_t *);
96 sys_read(struct read_args
*uap
)
98 struct thread
*td
= curthread
;
103 if ((ssize_t
)uap
->nbyte
< 0)
106 aiov
.iov_base
= uap
->buf
;
107 aiov
.iov_len
= uap
->nbyte
;
108 auio
.uio_iov
= &aiov
;
110 auio
.uio_offset
= -1;
111 auio
.uio_resid
= uap
->nbyte
;
112 auio
.uio_rw
= UIO_READ
;
113 auio
.uio_segflg
= UIO_USERSPACE
;
116 error
= kern_preadv(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
121 * Positioned (Pread) read system call
126 sys_extpread(struct extpread_args
*uap
)
128 struct thread
*td
= curthread
;
134 if ((ssize_t
)uap
->nbyte
< 0)
137 aiov
.iov_base
= uap
->buf
;
138 aiov
.iov_len
= uap
->nbyte
;
139 auio
.uio_iov
= &aiov
;
141 auio
.uio_offset
= uap
->offset
;
142 auio
.uio_resid
= uap
->nbyte
;
143 auio
.uio_rw
= UIO_READ
;
144 auio
.uio_segflg
= UIO_USERSPACE
;
147 flags
= uap
->flags
& O_FMASK
;
148 if (uap
->offset
!= (off_t
)-1)
151 error
= kern_preadv(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
156 * Scatter read system call.
161 sys_readv(struct readv_args
*uap
)
163 struct thread
*td
= curthread
;
165 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
168 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
173 auio
.uio_iovcnt
= uap
->iovcnt
;
174 auio
.uio_offset
= -1;
175 auio
.uio_rw
= UIO_READ
;
176 auio
.uio_segflg
= UIO_USERSPACE
;
179 error
= kern_preadv(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
181 iovec_free(&iov
, aiov
);
187 * Scatter positioned read system call.
192 sys_extpreadv(struct extpreadv_args
*uap
)
194 struct thread
*td
= curthread
;
196 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
200 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
205 auio
.uio_iovcnt
= uap
->iovcnt
;
206 auio
.uio_offset
= uap
->offset
;
207 auio
.uio_rw
= UIO_READ
;
208 auio
.uio_segflg
= UIO_USERSPACE
;
211 flags
= uap
->flags
& O_FMASK
;
212 if (uap
->offset
!= (off_t
)-1)
215 error
= kern_preadv(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
217 iovec_free(&iov
, aiov
);
225 kern_preadv(int fd
, struct uio
*auio
, int flags
, size_t *res
)
227 struct thread
*td
= curthread
;
228 struct proc
*p
= td
->td_proc
;
234 fp
= holdfp(p
->p_fd
, fd
, FREAD
);
237 if (flags
& O_FOFFSET
&& fp
->f_type
!= DTYPE_VNODE
) {
240 error
= dofileread(fd
, fp
, auio
, flags
, res
);
247 * Common code for readv and preadv that reads data in
248 * from a file using the passed in uio, offset, and flags.
250 * MPALMOSTSAFE - ktrace needs help
253 dofileread(int fd
, struct file
*fp
, struct uio
*auio
, int flags
, size_t *res
)
255 struct thread
*td
= curthread
;
259 struct iovec
*ktriov
= NULL
;
265 * if tracing, save a copy of iovec
267 if (KTRPOINT(td
, KTR_GENIO
)) {
268 int iovlen
= auio
->uio_iovcnt
* sizeof(struct iovec
);
270 MALLOC(ktriov
, struct iovec
*, iovlen
, M_TEMP
, M_WAITOK
);
271 bcopy((caddr_t
)auio
->uio_iov
, (caddr_t
)ktriov
, iovlen
);
275 len
= auio
->uio_resid
;
276 error
= fo_read(fp
, auio
, fp
->f_cred
, flags
);
278 if (auio
->uio_resid
!= len
&& (error
== ERESTART
||
279 error
== EINTR
|| error
== EWOULDBLOCK
))
283 if (ktriov
!= NULL
) {
285 ktruio
.uio_iov
= ktriov
;
286 ktruio
.uio_resid
= len
- auio
->uio_resid
;
288 ktrgenio(td
->td_lwp
, fd
, UIO_READ
, &ktruio
, error
);
291 FREE(ktriov
, M_TEMP
);
295 *res
= len
- auio
->uio_resid
;
306 sys_write(struct write_args
*uap
)
308 struct thread
*td
= curthread
;
313 if ((ssize_t
)uap
->nbyte
< 0)
316 aiov
.iov_base
= (void *)(uintptr_t)uap
->buf
;
317 aiov
.iov_len
= uap
->nbyte
;
318 auio
.uio_iov
= &aiov
;
320 auio
.uio_offset
= -1;
321 auio
.uio_resid
= uap
->nbyte
;
322 auio
.uio_rw
= UIO_WRITE
;
323 auio
.uio_segflg
= UIO_USERSPACE
;
326 error
= kern_pwritev(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
337 sys_extpwrite(struct extpwrite_args
*uap
)
339 struct thread
*td
= curthread
;
345 if ((ssize_t
)uap
->nbyte
< 0)
348 aiov
.iov_base
= (void *)(uintptr_t)uap
->buf
;
349 aiov
.iov_len
= uap
->nbyte
;
350 auio
.uio_iov
= &aiov
;
352 auio
.uio_offset
= uap
->offset
;
353 auio
.uio_resid
= uap
->nbyte
;
354 auio
.uio_rw
= UIO_WRITE
;
355 auio
.uio_segflg
= UIO_USERSPACE
;
358 flags
= uap
->flags
& O_FMASK
;
359 if (uap
->offset
!= (off_t
)-1)
361 error
= kern_pwritev(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
369 sys_writev(struct writev_args
*uap
)
371 struct thread
*td
= curthread
;
373 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
376 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
381 auio
.uio_iovcnt
= uap
->iovcnt
;
382 auio
.uio_offset
= -1;
383 auio
.uio_rw
= UIO_WRITE
;
384 auio
.uio_segflg
= UIO_USERSPACE
;
387 error
= kern_pwritev(uap
->fd
, &auio
, 0, &uap
->sysmsg_szresult
);
389 iovec_free(&iov
, aiov
);
395 * Gather positioned write system call
400 sys_extpwritev(struct extpwritev_args
*uap
)
402 struct thread
*td
= curthread
;
404 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
408 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
413 auio
.uio_iovcnt
= uap
->iovcnt
;
414 auio
.uio_offset
= uap
->offset
;
415 auio
.uio_rw
= UIO_WRITE
;
416 auio
.uio_segflg
= UIO_USERSPACE
;
419 flags
= uap
->flags
& O_FMASK
;
420 if (uap
->offset
!= (off_t
)-1)
423 error
= kern_pwritev(uap
->fd
, &auio
, flags
, &uap
->sysmsg_szresult
);
425 iovec_free(&iov
, aiov
);
433 kern_pwritev(int fd
, struct uio
*auio
, int flags
, size_t *res
)
435 struct thread
*td
= curthread
;
436 struct proc
*p
= td
->td_proc
;
442 fp
= holdfp(p
->p_fd
, fd
, FWRITE
);
445 else if ((flags
& O_FOFFSET
) && fp
->f_type
!= DTYPE_VNODE
) {
448 error
= dofilewrite(fd
, fp
, auio
, flags
, res
);
456 * Common code for writev and pwritev that writes data to
457 * a file using the passed in uio, offset, and flags.
459 * MPALMOSTSAFE - ktrace needs help
462 dofilewrite(int fd
, struct file
*fp
, struct uio
*auio
, int flags
, size_t *res
)
464 struct thread
*td
= curthread
;
465 struct lwp
*lp
= td
->td_lwp
;
469 struct iovec
*ktriov
= NULL
;
475 * if tracing, save a copy of iovec and uio
477 if (KTRPOINT(td
, KTR_GENIO
)) {
478 int iovlen
= auio
->uio_iovcnt
* sizeof(struct iovec
);
480 MALLOC(ktriov
, struct iovec
*, iovlen
, M_TEMP
, M_WAITOK
);
481 bcopy((caddr_t
)auio
->uio_iov
, (caddr_t
)ktriov
, iovlen
);
485 len
= auio
->uio_resid
;
486 error
= fo_write(fp
, auio
, fp
->f_cred
, flags
);
488 if (auio
->uio_resid
!= len
&& (error
== ERESTART
||
489 error
== EINTR
|| error
== EWOULDBLOCK
))
491 /* Socket layer is responsible for issuing SIGPIPE. */
492 if (error
== EPIPE
) {
494 lwpsignal(lp
->lwp_proc
, lp
, SIGPIPE
);
499 if (ktriov
!= NULL
) {
501 ktruio
.uio_iov
= ktriov
;
502 ktruio
.uio_resid
= len
- auio
->uio_resid
;
504 ktrgenio(lp
, fd
, UIO_WRITE
, &ktruio
, error
);
507 FREE(ktriov
, M_TEMP
);
511 *res
= len
- auio
->uio_resid
;
522 sys_ioctl(struct ioctl_args
*uap
)
527 error
= mapped_ioctl(uap
->fd
, uap
->com
, uap
->data
, NULL
, &uap
->sysmsg
);
532 struct ioctl_map_entry
{
534 struct ioctl_map_range
*cmd_ranges
;
535 LIST_ENTRY(ioctl_map_entry
) entries
;
539 * The true heart of all ioctl syscall handlers (native, emulation).
540 * If map != NULL, it will be searched for a matching entry for com,
541 * and appropriate conversions/conversion functions will be utilized.
544 mapped_ioctl(int fd
, u_long com
, caddr_t uspc_data
, struct ioctl_map
*map
,
547 struct thread
*td
= curthread
;
548 struct proc
*p
= td
->td_proc
;
551 struct ioctl_map_range
*iomc
= NULL
;
557 #define STK_PARAMS 128
559 char stkbuf
[STK_PARAMS
];
566 fp
= holdfp(p
->p_fd
, fd
, FREAD
|FWRITE
);
570 if (map
!= NULL
) { /* obey translation map */
572 struct ioctl_map_entry
*e
;
574 maskcmd
= com
& map
->mask
;
576 LIST_FOREACH(e
, &map
->mapping
, entries
) {
577 for (iomc
= e
->cmd_ranges
; iomc
->start
!= 0 ||
578 iomc
->maptocmd
!= 0 || iomc
->wrapfunc
!= NULL
||
579 iomc
->mapfunc
!= NULL
;
581 if (maskcmd
>= iomc
->start
&&
582 maskcmd
<= iomc
->end
)
586 /* Did we find a match? */
587 if (iomc
->start
!= 0 || iomc
->maptocmd
!= 0 ||
588 iomc
->wrapfunc
!= NULL
|| iomc
->mapfunc
!= NULL
)
593 (iomc
->start
== 0 && iomc
->maptocmd
== 0
594 && iomc
->wrapfunc
== NULL
&& iomc
->mapfunc
== NULL
)) {
595 kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
596 map
->sys
, fd
, maskcmd
,
597 (int)((maskcmd
>> 8) & 0xff),
598 (int)(maskcmd
& 0xff));
604 * If it's a non-range one to one mapping, maptocmd should be
605 * correct. If it's a ranged one to one mapping, we pass the
606 * original value of com, and for a range mapped to a different
607 * range, we always need a mapping function to translate the
608 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
610 if (iomc
->start
== iomc
->end
&& iomc
->maptocmd
== iomc
->maptoend
) {
611 com
= iomc
->maptocmd
;
612 } else if (iomc
->start
== iomc
->maptocmd
&& iomc
->end
== iomc
->maptoend
) {
613 if (iomc
->mapfunc
!= NULL
)
614 com
= iomc
->mapfunc(iomc
->start
, iomc
->end
,
615 iomc
->start
, iomc
->end
,
618 if (iomc
->mapfunc
!= NULL
) {
619 com
= iomc
->mapfunc(iomc
->start
, iomc
->end
,
620 iomc
->maptocmd
, iomc
->maptoend
,
623 kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
624 map
->sys
, fd
, maskcmd
,
625 (int)((maskcmd
>> 8) & 0xff),
626 (int)(maskcmd
& 0xff));
635 error
= fclrfdflags(p
->p_fd
, fd
, UF_EXCLOSE
);
638 error
= fsetfdflags(p
->p_fd
, fd
, UF_EXCLOSE
);
643 * Interpret high order word to find amount of data to be
644 * copied to/from the user's address space.
646 size
= IOCPARM_LEN(com
);
647 if (size
> IOCPARM_MAX
) {
653 if (size
> sizeof (ubuf
.stkbuf
)) {
654 memp
= kmalloc(size
, M_IOCTLOPS
, M_WAITOK
);
659 if ((com
& IOC_IN
) != 0) {
661 error
= copyin(uspc_data
, data
, (size_t)size
);
664 kfree(memp
, M_IOCTLOPS
);
668 *(caddr_t
*)data
= uspc_data
;
670 } else if ((com
& IOC_OUT
) != 0 && size
) {
672 * Zero the buffer so the user always
673 * gets back something deterministic.
675 bzero(data
, (size_t)size
);
676 } else if ((com
& IOC_VOID
) != 0) {
677 *(caddr_t
*)data
= uspc_data
;
682 if ((tmp
= *(int *)data
))
683 fp
->f_flag
|= FNONBLOCK
;
685 fp
->f_flag
&= ~FNONBLOCK
;
690 if ((tmp
= *(int *)data
))
691 fp
->f_flag
|= FASYNC
;
693 fp
->f_flag
&= ~FASYNC
;
694 error
= fo_ioctl(fp
, FIOASYNC
, (caddr_t
)&tmp
, cred
, msg
);
699 * If there is a override function,
700 * call it instead of directly routing the call
702 if (map
!= NULL
&& iomc
->wrapfunc
!= NULL
)
703 error
= iomc
->wrapfunc(fp
, com
, ocom
, data
, cred
);
705 error
= fo_ioctl(fp
, com
, data
, cred
, msg
);
707 * Copy any data to user, size was
708 * already set and checked above.
710 if (error
== 0 && (com
& IOC_OUT
) != 0 && size
!= 0)
711 error
= copyout(data
, uspc_data
, (size_t)size
);
715 kfree(memp
, M_IOCTLOPS
);
722 mapped_ioctl_register_handler(struct ioctl_map_handler
*he
)
724 struct ioctl_map_entry
*ne
;
726 KKASSERT(he
!= NULL
&& he
->map
!= NULL
&& he
->cmd_ranges
!= NULL
&&
727 he
->subsys
!= NULL
&& *he
->subsys
!= '\0');
729 ne
= kmalloc(sizeof(struct ioctl_map_entry
), M_IOCTLMAP
, M_WAITOK
);
731 ne
->subsys
= he
->subsys
;
732 ne
->cmd_ranges
= he
->cmd_ranges
;
734 LIST_INSERT_HEAD(&he
->map
->mapping
, ne
, entries
);
740 mapped_ioctl_unregister_handler(struct ioctl_map_handler
*he
)
742 struct ioctl_map_entry
*ne
;
744 KKASSERT(he
!= NULL
&& he
->map
!= NULL
&& he
->cmd_ranges
!= NULL
);
746 LIST_FOREACH(ne
, &he
->map
->mapping
, entries
) {
747 if (ne
->cmd_ranges
!= he
->cmd_ranges
)
749 LIST_REMOVE(ne
, entries
);
750 kfree(ne
, M_IOCTLMAP
);
756 static int nselcoll
; /* Select collisions since boot */
758 SYSCTL_INT(_kern
, OID_AUTO
, nselcoll
, CTLFLAG_RD
, &nselcoll
, 0, "");
761 * Select system call.
766 sys_select(struct select_args
*uap
)
769 struct timeval
*ktvp
;
773 * Get timeout if any.
775 if (uap
->tv
!= NULL
) {
776 error
= copyin(uap
->tv
, &ktv
, sizeof (ktv
));
779 error
= itimerfix(&ktv
);
791 error
= doselect(uap
->nd
, uap
->in
, uap
->ou
, uap
->ex
, ktvp
,
792 &uap
->sysmsg_result
);
800 * Pselect system call.
805 sys_pselect(struct pselect_args
*uap
)
807 struct thread
*td
= curthread
;
808 struct lwp
*lp
= td
->td_lwp
;
811 struct timeval
*ktvp
;
816 * Get timeout if any and convert it.
817 * Round up during conversion to avoid timeout going off early.
819 if (uap
->ts
!= NULL
) {
820 error
= copyin(uap
->ts
, &kts
, sizeof (kts
));
823 ktv
.tv_sec
= kts
.tv_sec
;
824 ktv
.tv_usec
= (kts
.tv_nsec
+ 999) / 1000;
825 error
= itimerfix(&ktv
);
834 * Install temporary signal mask if any provided.
836 if (uap
->sigmask
!= NULL
) {
837 error
= copyin(uap
->sigmask
, &sigmask
, sizeof(sigmask
));
841 lp
->lwp_oldsigmask
= lp
->lwp_sigmask
;
842 SIG_CANTMASK(sigmask
);
843 lp
->lwp_sigmask
= sigmask
;
851 error
= doselect(uap
->nd
, uap
->in
, uap
->ou
, uap
->ex
, ktvp
,
852 &uap
->sysmsg_result
);
854 if (uap
->sigmask
!= NULL
) {
855 /* doselect() responsible for turning ERESTART into EINTR */
856 KKASSERT(error
!= ERESTART
);
857 if (error
== EINTR
) {
859 * We can't restore the previous signal mask now
860 * because it could block the signal that interrupted
861 * us. So make a note to restore it after executing
864 lp
->lwp_flag
|= LWP_OLDMASK
;
867 * No handler to run. Restore previous mask immediately.
869 lp
->lwp_sigmask
= lp
->lwp_oldsigmask
;
878 * Common code for sys_select() and sys_pselect().
880 * in, out and ex are userland pointers. tv must point to validated
881 * kernel-side timeout value or NULL for infinite timeout. res must
882 * point to syscall return value.
885 doselect(int nd
, fd_set
*in
, fd_set
*ou
, fd_set
*ex
, struct timeval
*tv
,
888 struct lwp
*lp
= curthread
->td_lwp
;
889 struct proc
*p
= curproc
;
892 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
893 * infds with the new FD_SETSIZE of 1024, and more than enough for
894 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
897 fd_mask s_selbits
[howmany(2048, NFDBITS
)];
898 fd_mask
*ibits
[3], *obits
[3], *selbits
, *sbp
;
899 struct timeval atv
, rtv
, ttv
;
900 int ncoll
, error
, timo
;
901 u_int nbufbytes
, ncpbytes
, nfdbits
;
905 if (nd
> p
->p_fd
->fd_nfiles
)
906 nd
= p
->p_fd
->fd_nfiles
; /* forgiving; slightly wrong */
909 * Allocate just enough bits for the non-null fd_sets. Use the
910 * preallocated auto buffer if possible.
912 nfdbits
= roundup(nd
, NFDBITS
);
913 ncpbytes
= nfdbits
/ NBBY
;
916 nbufbytes
+= 2 * ncpbytes
;
918 nbufbytes
+= 2 * ncpbytes
;
920 nbufbytes
+= 2 * ncpbytes
;
921 if (nbufbytes
<= sizeof s_selbits
)
922 selbits
= &s_selbits
[0];
924 selbits
= kmalloc(nbufbytes
, M_SELECT
, M_WAITOK
);
927 * Assign pointers into the bit buffers and fetch the input bits.
928 * Put the output buffers together so that they can be bzeroed
932 #define getbits(name, x) \
937 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
939 sbp += ncpbytes / sizeof *sbp; \
940 error = copyin(name, ibits[x], ncpbytes); \
950 bzero(selbits
, nbufbytes
/ 2);
954 getmicrouptime(&rtv
);
955 timevaladd(&atv
, &rtv
);
963 lp
->lwp_flag
|= LWP_SELECT
;
964 error
= selscan(p
, ibits
, obits
, nd
, res
);
967 if (atv
.tv_sec
|| atv
.tv_usec
) {
968 getmicrouptime(&rtv
);
969 if (timevalcmp(&rtv
, &atv
, >=))
972 timevalsub(&ttv
, &rtv
);
973 timo
= ttv
.tv_sec
> 24 * 60 * 60 ?
974 24 * 60 * 60 * hz
: tvtohz_high(&ttv
);
977 if ((lp
->lwp_flag
& LWP_SELECT
) == 0 || nselcoll
!= ncoll
) {
981 lp
->lwp_flag
&= ~LWP_SELECT
;
983 error
= tsleep((caddr_t
)&selwait
, PCATCH
, "select", timo
);
989 lp
->lwp_flag
&= ~LWP_SELECT
;
990 /* select is not restarted after signals... */
991 if (error
== ERESTART
)
993 if (error
== EWOULDBLOCK
)
995 #define putbits(name, x) \
996 if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
1006 if (selbits
!= &s_selbits
[0])
1007 kfree(selbits
, M_SELECT
);
1012 selscan(struct proc
*p
, fd_mask
**ibits
, fd_mask
**obits
, int nfd
, int *res
)
1018 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
1019 static int flag
[3] = { POLLRDNORM
, POLLWRNORM
, POLLRDBAND
};
1021 for (msk
= 0; msk
< 3; msk
++) {
1022 if (ibits
[msk
] == NULL
)
1024 for (i
= 0; i
< nfd
; i
+= NFDBITS
) {
1025 bits
= ibits
[msk
][i
/NFDBITS
];
1026 /* ffs(int mask) not portable, fd_mask is long */
1027 for (fd
= i
; bits
&& fd
< nfd
; fd
++, bits
>>= 1) {
1030 fp
= holdfp(p
->p_fd
, fd
, -1);
1033 if (fo_poll(fp
, flag
[msk
], fp
->f_cred
)) {
1034 obits
[msk
][(fd
)/NFDBITS
] |=
1035 ((fd_mask
)1 << ((fd
) % NFDBITS
));
1052 sys_poll(struct poll_args
*uap
)
1054 struct pollfd
*bits
;
1055 struct pollfd smallbits
[32];
1056 struct timeval atv
, rtv
, ttv
;
1057 int ncoll
, error
= 0, timo
;
1060 struct lwp
*lp
= curthread
->td_lwp
;
1061 struct proc
*p
= curproc
;
1065 * This is kinda bogus. We have fd limits, but that is not
1066 * really related to the size of the pollfd array. Make sure
1067 * we let the process use at least FD_SETSIZE entries and at
1068 * least enough for the current limits. We want to be reasonably
1069 * safe, but not overly restrictive.
1071 if (nfds
> p
->p_rlimit
[RLIMIT_NOFILE
].rlim_cur
&& nfds
> FD_SETSIZE
)
1073 ni
= nfds
* sizeof(struct pollfd
);
1074 if (ni
> sizeof(smallbits
))
1075 bits
= kmalloc(ni
, M_TEMP
, M_WAITOK
);
1078 error
= copyin(uap
->fds
, bits
, ni
);
1081 if (uap
->timeout
!= INFTIM
) {
1082 atv
.tv_sec
= uap
->timeout
/ 1000;
1083 atv
.tv_usec
= (uap
->timeout
% 1000) * 1000;
1084 if (itimerfix(&atv
)) {
1088 getmicrouptime(&rtv
);
1089 timevaladd(&atv
, &rtv
);
1097 lp
->lwp_flag
|= LWP_SELECT
;
1099 error
= pollscan(p
, bits
, nfds
, &uap
->sysmsg_result
);
1101 if (error
|| uap
->sysmsg_result
)
1103 if (atv
.tv_sec
|| atv
.tv_usec
) {
1104 getmicrouptime(&rtv
);
1105 if (timevalcmp(&rtv
, &atv
, >=))
1108 timevalsub(&ttv
, &rtv
);
1109 timo
= ttv
.tv_sec
> 24 * 60 * 60 ?
1110 24 * 60 * 60 * hz
: tvtohz_high(&ttv
);
1113 tsleep_interlock(&selwait
, PCATCH
);
1114 if ((lp
->lwp_flag
& LWP_SELECT
) == 0 || nselcoll
!= ncoll
) {
1118 lp
->lwp_flag
&= ~LWP_SELECT
;
1119 error
= tsleep(&selwait
, PCATCH
| PINTERLOCKED
, "poll", timo
);
1124 lp
->lwp_flag
&= ~LWP_SELECT
;
1125 /* poll is not restarted after signals... */
1126 if (error
== ERESTART
)
1128 if (error
== EWOULDBLOCK
)
1131 error
= copyout(bits
, uap
->fds
, ni
);
1136 if (ni
> sizeof(smallbits
))
1137 kfree(bits
, M_TEMP
);
1142 pollscan(struct proc
*p
, struct pollfd
*fds
, u_int nfd
, int *res
)
1148 for (i
= 0; i
< nfd
; i
++, fds
++) {
1149 if (fds
->fd
>= p
->p_fd
->fd_nfiles
) {
1150 fds
->revents
= POLLNVAL
;
1152 } else if (fds
->fd
< 0) {
1155 fp
= holdfp(p
->p_fd
, fds
->fd
, -1);
1157 fds
->revents
= POLLNVAL
;
1161 * Note: backend also returns POLLHUP and
1162 * POLLERR if appropriate.
1164 fds
->revents
= fo_poll(fp
, fds
->events
,
1166 if (fds
->revents
!= 0)
/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
int
sys_openbsd_poll(struct openbsd_poll_args *uap)
{
	/* Argument structs are layout-compatible; delegate directly. */
	return (sys_poll((struct poll_args *)uap));
}
1190 seltrue(cdev_t dev
, int events
)
1192 return (events
& (POLLIN
| POLLOUT
| POLLRDNORM
| POLLWRNORM
));
1196 * Record a select request. A global wait must be used since a process/thread
1197 * might go away after recording its request.
1200 selrecord(struct thread
*selector
, struct selinfo
*sip
)
1203 struct lwp
*lp
= NULL
;
1205 if (selector
->td_lwp
== NULL
)
1206 panic("selrecord: thread needs a process");
1208 if (sip
->si_pid
== selector
->td_proc
->p_pid
&&
1209 sip
->si_tid
== selector
->td_lwp
->lwp_tid
)
1211 if (sip
->si_pid
&& (p
= pfind(sip
->si_pid
)))
1212 lp
= lwp_rb_tree_RB_LOOKUP(&p
->p_lwp_tree
, sip
->si_tid
);
1213 if (lp
!= NULL
&& lp
->lwp_wchan
== (caddr_t
)&selwait
) {
1214 sip
->si_flags
|= SI_COLL
;
1216 sip
->si_pid
= selector
->td_proc
->p_pid
;
1217 sip
->si_tid
= selector
->td_lwp
->lwp_tid
;
1222 * Do a wakeup when a selectable event occurs.
1225 selwakeup(struct selinfo
*sip
)
1228 struct lwp
*lp
= NULL
;
1230 if (sip
->si_pid
== 0)
1232 if (sip
->si_flags
& SI_COLL
) {
1234 sip
->si_flags
&= ~SI_COLL
;
1235 wakeup((caddr_t
)&selwait
); /* YYY fixable */
1237 p
= pfind(sip
->si_pid
);
1241 lp
= lwp_rb_tree_RB_LOOKUP(&p
->p_lwp_tree
, sip
->si_tid
);
1246 if (lp
->lwp_wchan
== (caddr_t
)&selwait
) {
1248 * Flag the process to break the tsleep when
1249 * setrunnable is called, but only call setrunnable
1250 * here if the process is not in a stopped state.
1252 lp
->lwp_flag
|= LWP_BREAKTSLEEP
;
1253 if (p
->p_stat
!= SSTOP
)
1255 } else if (lp
->lwp_flag
& LWP_SELECT
) {
1256 lp
->lwp_flag
&= ~LWP_SELECT
;