/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
 * $DragonFly: src/sys/kern/sys_generic.c,v 1.49 2008/05/05 22:09:44 dillon Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/kern_syscall.h>
#include <sys/malloc.h>
#include <sys/mapped_ioctl.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/ktrace.h>

#include <vm/vm_page.h>
#include <sys/file2.h>

#include <machine/limits.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex,
			 struct timeval *tv, int *res);
static int	pollscan(struct proc *, struct pollfd *, u_int, int *);
static int	selscan(struct proc *, fd_mask **, fd_mask **,
			int, int *);
static int	dofileread(int, struct file *, struct uio *, int, size_t *);
static int	dofilewrite(int, struct file *, struct uio *, int, size_t *);

/*
 * Read system call.
 */
int
sys_read(struct read_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	return(error);
}

/*
 * Positioned (Pread) read system call
 */
int
sys_extpread(struct extpread_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult);

	return(error);
}
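
/*
 * Illustrative userland sketch (not part of this file): calling the
 * positioned read from a program.  Assumes the extpread(2) prototype
 * exported by <unistd.h>; as the flag setup above shows, an offset of
 * (off_t)-1 falls back to the current file pointer.
 */
#if 0	/* example only */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[128];
	ssize_t n;
	int fd = open("/etc/motd", O_RDONLY);

	if (fd < 0)
		return (1);
	/* read from byte offset 16 without moving the file pointer */
	n = extpread(fd, buf, sizeof(buf), 0, 16);
	printf("read %zd bytes\n", n);
	close(fd);
	return (0);
}
#endif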

/*
 * Scatter read system call.
 */
int
sys_readv(struct readv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}
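
/*
 * Illustrative sketch (not part of this file): the stack/heap pattern
 * behind iovec_copyin()/iovec_free() used above.  Small iovec counts
 * reuse the caller's on-stack UIO_SMALLIOV array; larger counts fall
 * back to kmalloc().  The helper below is hypothetical and only shows
 * the shape of the optimization.
 */
#if 0	/* example only */
static int
example_iovec_setup(const struct iovec *uiov, struct iovec **kiov,
		    struct iovec *stackiov, int iovcnt)
{
	if (iovcnt < 0 || iovcnt > UIO_MAXIOV)
		return (EINVAL);
	if (iovcnt > UIO_SMALLIOV)
		*kiov = kmalloc(iovcnt * sizeof(struct iovec), M_IOV, M_WAITOK);
	else
		*kiov = stackiov;	/* no allocation in the common case */
	return (copyin(uiov, *kiov, iovcnt * sizeof(struct iovec)));
}
#endif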

/*
 * Scatter positioned read system call.
 */
int
sys_extpreadv(struct extpreadv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return(error);
}

int
kern_preadv(int fd, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FREAD);
	if (fp == NULL)
		return (EBADF);
	if (flags & O_FOFFSET && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofileread(fd, fp, auio, flags, res);
	}
	fdrop(fp);
	return(error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofileread(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	int error;
	size_t len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_read(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(td->td_lwp, fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}
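
/*
 * Illustrative sketch (not part of this file): why the iovec is copied
 * before the transfer above.  uiomove() advances iov_base/iov_len in
 * place as it consumes descriptors, so post-I/O logging must work from
 * a snapshot taken beforehand.  The helper name is hypothetical.
 */
#if 0	/* example only */
static struct iovec *
example_snapshot_iov(struct uio *uio)
{
	int iovlen = uio->uio_iovcnt * sizeof(struct iovec);
	struct iovec *copy;

	copy = kmalloc(iovlen, M_TEMP, M_WAITOK);
	bcopy(uio->uio_iov, copy, iovlen);	/* iovecs still pristine here */
	return (copy);				/* caller kfree()s with M_TEMP */
}
#endif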

/*
 * Write system call
 */
int
sys_write(struct write_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	return(error);
}

/*
 * Positioned (Pwrite) write system call
 */
int
sys_extpwrite(struct extpwrite_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult);

	return(error);
}

/*
 * Gather write system call
 */
int
sys_writev(struct writev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}

/*
 * Gather positioned write system call
 */
int
sys_extpwritev(struct extpwritev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return(error);
}

int
kern_pwritev(int fd, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FWRITE);
	if (fp == NULL)
		return (EBADF);
	else if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofilewrite(fd, fp, auio, flags, res);
	}
	fdrop(fp);
	return(error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	int error;
	size_t len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_write(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE)
			lwpsignal(lp->lwp_proc, lp, SIGPIPE);
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(lp, fd, UIO_WRITE, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}

/*
 * Ioctl system call
 */
int
sys_ioctl(struct ioctl_args *uap)
{
	return(mapped_ioctl(uap->fd, uap->com, uap->data, NULL, &uap->sysmsg));
}

struct ioctl_map_entry {
	const char *subsys;
	struct ioctl_map_range *cmd_ranges;
	LIST_ENTRY(ioctl_map_entry) entries;
};

/*
 * The true heart of all ioctl syscall handlers (native, emulation).
 * If map != NULL, it will be searched for a matching entry for com,
 * and appropriate conversions/conversion functions will be utilized.
 */
int
mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map,
	     struct sysmsg *msg)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct ucred *cred;
	struct file *fp;
	struct ioctl_map_range *iomc = NULL;
	int error;
	u_int size;
	u_long ocom = com;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	union {
		char stkbuf[STK_PARAMS];
		long align;
	} ubuf;

	KKASSERT(p);
	cred = td->td_ucred;

	fp = holdfp(p->p_fd, fd, FREAD|FWRITE);
	if (fp == NULL)
		return(EBADF);

	if (map != NULL) {	/* obey translation map */
		u_long maskcmd;
		struct ioctl_map_entry *e;

		maskcmd = com & map->mask;

		LIST_FOREACH(e, &map->mapping, entries) {
			for (iomc = e->cmd_ranges; iomc->start != 0 ||
			     iomc->maptocmd != 0 || iomc->wrapfunc != NULL ||
			     iomc->mapfunc != NULL;
			     iomc++) {
				if (maskcmd >= iomc->start &&
				    maskcmd <= iomc->end)
					break;
			}

			/* Did we find a match? */
			if (iomc->start != 0 || iomc->maptocmd != 0 ||
			    iomc->wrapfunc != NULL || iomc->mapfunc != NULL)
				break;
		}

		if (iomc == NULL ||
		    (iomc->start == 0 && iomc->maptocmd == 0
		     && iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) {
			kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
			       map->sys, fd, maskcmd,
			       (int)((maskcmd >> 8) & 0xff),
			       (int)(maskcmd & 0xff));
			error = EINVAL;
			goto done;
		}

		/*
		 * If it's a non-range one to one mapping, maptocmd should be
		 * correct. If it's a ranged one to one mapping, we pass the
		 * original value of com, and for a range mapped to a different
		 * range, we always need a mapping function to translate the
		 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
		 */
		if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) {
			com = iomc->maptocmd;
		} else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) {
			if (iomc->mapfunc != NULL)
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->start, iomc->end,
						    com, com);
		} else {
			if (iomc->mapfunc != NULL) {
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->maptocmd, iomc->maptoend,
						    com, ocom);
			} else {
				kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
				       map->sys, fd, maskcmd,
				       (int)((maskcmd >> 8) & 0xff),
				       (int)(maskcmd & 0xff));
				error = EINVAL;
				goto done;
			}
		}
	}

	switch (com) {
	case FIONCLEX:
		error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	case FIOCLEX:
		error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto done;
	}

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = kmalloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if ((com & IOC_IN) != 0) {
		if (size != 0) {
			error = copyin(uspc_data, data, (size_t)size);
			if (error) {
				if (memp != NULL)
					kfree(memp, M_IOCTLOPS);
				goto done;
			}
		} else {
			*(caddr_t *)data = uspc_data;
		}
	} else if ((com & IOC_OUT) != 0 && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, (size_t)size);
	} else if ((com & IOC_VOID) != 0) {
		*(caddr_t *)data = uspc_data;
	}

	switch (com) {
	case FIONBIO:
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = 0;
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred, msg);
		break;

	default:
		/*
		 * If there is an override function,
		 * call it instead of directly routing the call
		 */
		if (map != NULL && iomc->wrapfunc != NULL)
			error = iomc->wrapfunc(fp, com, ocom, data, cred);
		else
			error = fo_ioctl(fp, com, data, cred, msg);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
			error = copyout(data, uspc_data, (size_t)size);
		break;
	}
	if (memp != NULL)
		kfree(memp, M_IOCTLOPS);
done:
	fdrop(fp);
	return(error);
}
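
/*
 * Illustrative sketch (not part of this file): how the command word
 * interpreted above encodes direction and payload size.  This follows
 * the standard BSD <sys/ioccom.h> layout, where the top bits carry
 * IOC_IN/IOC_OUT/IOC_VOID and IOCPARM_LEN() extracts the parameter size.
 */
#if 0	/* example only */
#include <sys/ioccom.h>

static void
example_decode_cmd(u_long com)
{
	u_int size = IOCPARM_LEN(com);		/* bytes copied in/out */
	int group = (int)((com >> 8) & 0xff);	/* ioctl "group" letter */
	int num = (int)(com & 0xff);		/* command number */

	kprintf("cmd %#lx: group '%c' num %d size %u%s%s%s\n",
		com, group, num, size,
		(com & IOC_IN) ? " in" : "",
		(com & IOC_OUT) ? " out" : "",
		(com & IOC_VOID) ? " void" : "");
}
#endif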

int
mapped_ioctl_register_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
		 he->subsys != NULL && *he->subsys != '\0');

	ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP, M_WAITOK);

	ne->subsys = he->subsys;
	ne->cmd_ranges = he->cmd_ranges;

	LIST_INSERT_HEAD(&he->map->mapping, ne, entries);

	return(0);
}
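
/*
 * Illustrative sketch (not part of this file): how an emulation layer
 * might describe and register a translation table for mapped_ioctl().
 * All names and command values here are hypothetical, and designated
 * initializers are used because the exact field order of the map
 * structures is not shown in this file.  Per the comment in
 * mapped_ioctl(), a range mapped to a *different* range needs a
 * mapping function.
 */
#if 0	/* example only */
extern struct ioctl_map example_emul_map;	/* hypothetical per-emulation map */

/* translate commands in 0x6500-0x65ff onto native 0x9500-0x95ff */
static u_long
example_mapfunc(u_long start, u_long end, u_long maptocmd, u_long maptoend,
		u_long com, u_long ocom)
{
	return (maptocmd + (com & 0xffff) - start);
}

static struct ioctl_map_range example_ranges[] = {
	{ .start = 0x6500, .end = 0x65ff,
	  .maptocmd = 0x9500, .maptoend = 0x95ff,
	  .wrapfunc = NULL, .mapfunc = example_mapfunc },
	{ 0 }	/* all-zero terminator, as scanned by mapped_ioctl() */
};

static struct ioctl_map_handler example_handler = {
	.map = &example_emul_map,
	.subsys = "example",
	.cmd_ranges = example_ranges,
};

static void
example_register(void)
{
	mapped_ioctl_register_handler(&example_handler);
}
#endif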

int
mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);

	LIST_FOREACH(ne, &he->map->mapping, entries) {
		if (ne->cmd_ranges != he->cmd_ranges)
			continue;
		LIST_REMOVE(ne, entries);
		kfree(ne, M_IOCTLMAP);
		return(0);
	}
	return(EINVAL);
}

static int	nselcoll;	/* Select collisions since boot */
int	selwait;
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 */
int
sys_select(struct select_args *uap)
{
	struct timeval ktv;
	struct timeval *ktvp;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &ktv, sizeof (ktv));
		if (error)
			return (error);
		error = itimerfix(&ktv);
		if (error)
			return (error);
		ktvp = &ktv;
	} else {
		ktvp = NULL;
	}

	/*
	 * Do real work.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp,
			 &uap->sysmsg_result);

	return (error);
}

/*
 * Pselect system call.
 */
int
sys_pselect(struct pselect_args *uap)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct timespec kts;
	struct timeval ktv;
	struct timeval *ktvp;
	sigset_t sigmask;
	int error;

	/*
	 * Get timeout if any and convert it.
	 * Round up during conversion to avoid timeout going off early.
	 */
	if (uap->ts != NULL) {
		error = copyin(uap->ts, &kts, sizeof (kts));
		if (error)
			return (error);
		ktv.tv_sec = kts.tv_sec;
		ktv.tv_usec = (kts.tv_nsec + 999) / 1000;
		error = itimerfix(&ktv);
		if (error)
			return (error);
		ktvp = &ktv;
	} else {
		ktvp = NULL;
	}

	/*
	 * Install temporary signal mask if any provided.
	 */
	if (uap->sigmask != NULL) {
		error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
		if (error)
			return (error);
		lp->lwp_oldsigmask = lp->lwp_sigmask;
		SIG_CANTMASK(sigmask);
		lp->lwp_sigmask = sigmask;
	}

	/*
	 * Do real work.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp,
			 &uap->sysmsg_result);

	if (uap->sigmask != NULL) {
		/* doselect() responsible for turning ERESTART into EINTR */
		KKASSERT(error != ERESTART);
		if (error == EINTR) {
			/*
			 * We can't restore the previous signal mask now
			 * because it could block the signal that interrupted
			 * us.  So make a note to restore it after executing
			 * the handler.
			 */
			lp->lwp_flag |= LWP_OLDMASK;
		} else {
			/*
			 * No handler to run. Restore previous mask immediately.
			 */
			lp->lwp_sigmask = lp->lwp_oldsigmask;
		}
	}

	return (error);
}
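
/*
 * Illustrative userland sketch (not part of this file): the race that
 * pselect(2) closes.  Unblocking a signal and then calling select()
 * leaves a window where the signal can arrive before select() sleeps;
 * pselect() swaps the mask and sleeps atomically, which is exactly what
 * the temporary-mask handling above implements.  Signal handler setup
 * is omitted for brevity.
 */
#if 0	/* example only */
#include <sys/select.h>
#include <signal.h>

static int
wait_for_fd_or_sigchld(int fd)
{
	sigset_t blocked, origmask;
	fd_set rfds;

	/* keep SIGCHLD blocked except while sleeping in pselect() */
	sigemptyset(&blocked);
	sigaddset(&blocked, SIGCHLD);
	sigprocmask(SIG_BLOCK, &blocked, &origmask);

	FD_ZERO(&rfds);
	FD_SET(fd, &rfds);

	/* the signal can only be delivered while pselect() sleeps */
	return (pselect(fd + 1, &rfds, NULL, NULL, NULL, &origmask));
}
#endif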

/*
 * Common code for sys_select() and sys_pselect().
 *
 * in, out and ex are userland pointers.  tv must point to validated
 * kernel-side timeout value or NULL for infinite timeout.  res must
 * point to syscall return value.
 */
static int
doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timeval *tv,
	 int *res)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = curproc;

	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles)
		nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = kmalloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done;				\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tv != NULL) {
		atv = *tv;
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
retry:
	ncoll = nselcoll;
	lp->lwp_flag |= LWP_SELECT;
	error = selscan(p, ibits, obits, nd, res);
	if (error || *res)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
	}
	crit_enter();
	if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) {
		crit_exit();
		goto retry;
	}
	lp->lwp_flag &= ~LWP_SELECT;
	error = tsleep((caddr_t)&selwait, PCATCH, "select", timo);
	crit_exit();

	if (error == 0)
		goto retry;
done:
	lp->lwp_flag &= ~LWP_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		kfree(selbits, M_SELECT);

	return (error);
}
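
/*
 * Illustrative sketch (not part of this file): the buffer math above
 * for a concrete descriptor count.  The helper is hypothetical.
 */
#if 0	/* example only */
static void
example_selbits_math(int nd)
{
	u_int nfdbits = roundup(nd, NFDBITS);	/* bits, rounded to a word */
	u_int ncpbytes = nfdbits / NBBY;	/* bytes per copied fd_set */
	u_int nbufbytes = 3 * 2 * ncpbytes;	/* 3 sets, input+output halves */

	/* e.g. nd=100, NFDBITS=64: nfdbits=128, ncpbytes=16, nbufbytes=96 */
	kprintf("nd=%d needs %u bytes; auto buffer holds %d\n",
		nd, nbufbytes, 2048 / NBBY);
}
#endif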

static int
selscan(struct proc *p, fd_mask **ibits, fd_mask **obits, int nfd, int *res)
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };

	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				fp = holdfp(p->p_fd, fd, -1);
				if (fp == NULL)
					return (EBADF);
				if (fo_poll(fp, flag[msk], fp->f_cred)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
				fdrop(fp);
			}
		}
	}
	*res = n;
	return (0);
}

/*
 * Poll system call.
 */
int
sys_poll(struct poll_args *uap)
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = curproc;

	nfds = uap->nfds;
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmalloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done;
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
retry:
	ncoll = nselcoll;
	lp->lwp_flag |= LWP_SELECT;
	error = pollscan(p, bits, nfds, &uap->sysmsg_result);
	if (error || uap->sysmsg_result)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
	}
	crit_enter();
	if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) {
		crit_exit();
		goto retry;
	}
	lp->lwp_flag &= ~LWP_SELECT;
	error = tsleep((caddr_t)&selwait, PCATCH, "poll", timo);
	crit_exit();

	if (error == 0)
		goto retry;
done:
	lp->lwp_flag &= ~LWP_SELECT;
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, uap->fds, ni);
	if (ni > sizeof(smallbits))
		kfree(bits, M_TEMP);

	return (error);
}

static int
pollscan(struct proc *p, struct pollfd *fds, u_int nfd, int *res)
{
	struct file *fp;
	u_int i;
	int n = 0;

	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= p->p_fd->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = holdfp(p->p_fd, fds->fd, -1);
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
						       fp->f_cred);
				if (fds->revents != 0)
					n++;
				fdrop(fp);
			}
		}
	}
	*res = n;
	return (0);
}
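
/*
 * Illustrative userland sketch (not part of this file): the pollscan()
 * conventions visible above.  A negative fd is silently skipped (its
 * revents is forced to 0) and a closed or out-of-range fd reports
 * POLLNVAL instead of causing poll(2) itself to fail.
 */
#if 0	/* example only */
#include <poll.h>
#include <stdio.h>

int
main(void)
{
	struct pollfd pfd[2];

	pfd[0].fd = 0;		/* stdin */
	pfd[0].events = POLLIN;
	pfd[1].fd = -1;		/* ignored entry, revents forced to 0 */
	pfd[1].events = POLLIN;

	if (poll(pfd, 2, 1000) > 0 && (pfd[0].revents & POLLIN))
		printf("stdin is readable\n");
	return (0);
}
#endif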

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation.. OpenBSD uses select ops.
 */
int
sys_openbsd_poll(struct openbsd_poll_args *uap)
{
	return (sys_poll((struct poll_args *)uap));
}

int
seltrue(cdev_t dev, int events)
{
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.  A global wait must be used since a process/thread
 * might go away after recording its request.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	if (selector->td_lwp == NULL)
		panic("selrecord: thread needs a process");

	if (sip->si_pid == selector->td_proc->p_pid &&
	    sip->si_tid == selector->td_lwp->lwp_tid)
		return;
	if (sip->si_pid && (p = pfind(sip->si_pid)))
		lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid);
	if (lp != NULL && lp->lwp_wchan == (caddr_t)&selwait) {
		sip->si_flags |= SI_COLL;
	} else {
		sip->si_pid = selector->td_proc->p_pid;
		sip->si_tid = selector->td_lwp->lwp_tid;
	}
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	if (sip->si_pid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);	/* YYY fixable */
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	if (p == NULL)
		return;
	lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid);
	if (lp == NULL)
		return;

	crit_enter();
	if (lp->lwp_wchan == (caddr_t)&selwait) {
		/*
		 * Flag the process to break the tsleep when
		 * setrunnable is called, but only call setrunnable
		 * here if the process is not in a stopped state.
		 */
		lp->lwp_flag |= LWP_BREAKTSLEEP;
		if (p->p_stat != SSTOP)
			setrunnable(lp);
	} else if (lp->lwp_flag & LWP_SELECT) {
		lp->lwp_flag &= ~LWP_SELECT;
	}
	crit_exit();
}
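
/*
 * Illustrative sketch (not part of this file): the canonical pairing of
 * selrecord()/selwakeup() in a character-device poll backend.  The softc
 * layout and helper names are hypothetical; only the selrecord() and
 * selwakeup() calls are this file's API.
 */
#if 0	/* example only */
struct example_softc {
	struct selinfo sc_rsel;		/* readers blocked in select/poll */
	int sc_ready;			/* data-available flag */
};

/* poll backend: record the request when no data is ready yet */
static int
example_fo_poll(struct example_softc *sc, int events)
{
	int revents = 0;

	if (events & (POLLIN | POLLRDNORM)) {
		if (sc->sc_ready)
			revents |= events & (POLLIN | POLLRDNORM);
		else
			selrecord(curthread, &sc->sc_rsel);
	}
	return (revents);
}

/* interrupt/completion path: data arrived, wake any recorded waiter */
static void
example_data_ready(struct example_softc *sc)
{
	sc->sc_ready = 1;
	selwakeup(&sc->sc_rsel);
}
#endif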