/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
 * $DragonFly: src/sys/kern/sys_generic.c,v 1.49 2008/05/05 22:09:44 dillon Exp $
 */
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/kern_syscall.h>
#include <sys/malloc.h>
#include <sys/mapped_ioctl.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/ktrace.h>
#include <vm/vm_page.h>
#include <sys/file2.h>

#include <machine/limits.h>
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");
static int	doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex,
			 struct timeval *tv, int *res);
static int	pollscan(struct proc *, struct pollfd *, u_int, int *);
static int	selscan(struct proc *, fd_mask **, fd_mask **,
			int, int *);
static int	dofileread(int, struct file *, struct uio *, int, int *);
static int	dofilewrite(int, struct file *, struct uio *, int, int *);
/*
 * Read system call.
 */
int
sys_read(struct read_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	if (auio.uio_resid < 0)
		return (EINVAL);

	error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_result);

	return (error);
}
/*
 * Positioned (Pread) read system call.
 */
int
sys_extpread(struct extpread_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	if (auio.uio_resid < 0)
		return (EINVAL);

	error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_result);

	return (error);
}
/*
 * Scatter read system call.
 */
int
sys_readv(struct readv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return (error);
}
/*
 * Scatter positioned read system call.
 */
int
sys_extpreadv(struct extpreadv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return (error);
}
int
kern_preadv(int fd, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FREAD);
	if (fp == NULL)
		return (EBADF);
	if (flags & O_FOFFSET && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else if (auio->uio_resid < 0) {
		error = EINVAL;
	} else {
		error = dofileread(fd, fp, auio, flags, res);
	}
	fdrop(fp);
	return (error);
}
/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofileread(int fd, struct file *fp, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	int error;
	int len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_read(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(td->td_lwp, fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return (error);
}
/*
 * Write system call.
 */
int
sys_write(struct write_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	if (auio.uio_resid < 0)
		return (EINVAL);

	error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_result);

	return (error);
}
/*
 * Positioned (Pwrite) write system call.
 */
int
sys_extpwrite(struct extpwrite_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	if (auio.uio_resid < 0)
		return (EINVAL);

	error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_result);

	return (error);
}
/*
 * Gather write system call.
 */
int
sys_writev(struct writev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return (error);
}
/*
 * Gather positioned write system call.
 */
int
sys_extpwritev(struct extpwritev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return (error);
}
int
kern_pwritev(int fd, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FWRITE);
	if (fp == NULL)
		return (EBADF);
	else if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofilewrite(fd, fp, auio, flags, res);
	}
	fdrop(fp);
	return (error);
}
/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	int error;
	int len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_write(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE)
			lwpsignal(lp->lwp_proc, lp, SIGPIPE);
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(lp, fd, UIO_WRITE, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return (error);
}
/*
 * Ioctl system call.
 */
int
sys_ioctl(struct ioctl_args *uap)
{
	return(mapped_ioctl(uap->fd, uap->com, uap->data, NULL));
}
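/*
 * Note that the native ioctl(2) path above passes a NULL map, so no
 * command translation occurs; emulation layers invoke mapped_ioctl()
 * with their own translation map instead.
 */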
struct ioctl_map_entry {
	const char *subsys;
	struct ioctl_map_range *cmd_ranges;
	LIST_ENTRY(ioctl_map_entry) entries;
};
/*
 * The true heart of all ioctl syscall handlers (native, emulation).
 * If map != NULL, it will be searched for a matching entry for com,
 * and appropriate conversions/conversion functions will be utilized.
 */
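/*
 * Usage sketch (hypothetical, not part of this file): an emulation layer
 * describes its translations as an array of ioctl_map_range entries
 * terminated by an all-zero sentinel, which the scan loop below detects.
 * The field names follow the accesses made in mapped_ioctl():
 *
 *	static struct ioctl_map_range my_ranges[] = {
 *		{ .start = 0x6500, .end = 0x65ff,
 *		  .maptocmd = 0x9500, .maptoend = 0x95ff,
 *		  .mapfunc = my_mapfunc, .wrapfunc = NULL },
 *		{ 0 }	/* sentinel: start/maptocmd/funcs all zero */
 *	};
 */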
int
mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct ucred *cred;
	struct file *fp;
	struct ioctl_map_range *iomc = NULL;
	int error;
	u_int size;
	u_long ocom = com;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	union {
		char stkbuf[STK_PARAMS];
		long align;
	} ubuf;

	KKASSERT(p);
	cred = p->p_ucred;

	fp = holdfp(p->p_fd, fd, FREAD|FWRITE);
	if (fp == NULL)
		return (EBADF);

	if (map != NULL) {	/* obey translation map */
		u_long maskcmd;
		struct ioctl_map_entry *e;

		maskcmd = com & map->mask;

		LIST_FOREACH(e, &map->mapping, entries) {
			for (iomc = e->cmd_ranges; iomc->start != 0 ||
			     iomc->maptocmd != 0 || iomc->wrapfunc != NULL ||
			     iomc->mapfunc != NULL;
			     iomc++) {
				if (maskcmd >= iomc->start &&
				    maskcmd <= iomc->end)
					break;
			}

			/* Did we find a match? */
			if (iomc->start != 0 || iomc->maptocmd != 0 ||
			    iomc->wrapfunc != NULL || iomc->mapfunc != NULL)
				break;
		}

		if (iomc == NULL ||
		    (iomc->start == 0 && iomc->maptocmd == 0
		     && iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) {
			kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
			       map->sys, fd, maskcmd,
			       (int)((maskcmd >> 8) & 0xff),
			       (int)(maskcmd & 0xff));
			error = EINVAL;
			goto done;
		}

		/*
		 * If it's a non-range one to one mapping, maptocmd should be
		 * correct.  If it's a ranged one to one mapping, we pass the
		 * original value of com, and for a range mapped to a different
		 * range, we always need a mapping function to translate the
		 * ioctl to our native ioctl.  Ex. 6500-65ff <-> 9500-95ff
		 */
		if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) {
			com = iomc->maptocmd;
		} else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) {
			if (iomc->mapfunc != NULL)
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->start, iomc->end,
						    com, ocom);
		} else {
			if (iomc->mapfunc != NULL) {
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->maptocmd, iomc->maptoend,
						    com, ocom);
			} else {
				kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
				       map->sys, fd, maskcmd,
				       (int)((maskcmd >> 8) & 0xff),
				       (int)(maskcmd & 0xff));
				error = EINVAL;
				goto done;
			}
		}
	}

	switch (com) {
	case FIONCLEX:
		error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	case FIOCLEX:
		error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto done;
	}

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = kmalloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if ((com & IOC_IN) != 0) {
		if (size != 0) {
			error = copyin(uspc_data, data, (u_int)size);
			if (error) {
				if (memp != NULL)
					kfree(memp, M_IOCTLOPS);
				goto done;
			}
		} else {
			*(caddr_t *)data = uspc_data;
		}
	} else if ((com & IOC_OUT) != 0 && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, (size_t)size);
	} else if ((com & IOC_VOID) != 0) {
		*(caddr_t *)data = uspc_data;
	}

	switch (com) {
	case FIONBIO:
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = 0;
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred);
		break;

	default:
		/*
		 * If there is an override function,
		 * call it instead of directly routing the call.
		 */
		if (map != NULL && iomc->wrapfunc != NULL)
			error = iomc->wrapfunc(fp, com, ocom, data, cred);
		else
			error = fo_ioctl(fp, com, data, cred);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
			error = copyout(data, uspc_data, (u_int)size);
		break;
	}
	if (memp != NULL)
		kfree(memp, M_IOCTLOPS);
done:
	fdrop(fp);
	return (error);
}
int
mapped_ioctl_register_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
		 he->subsys != NULL && *he->subsys != '\0');

	ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP, M_WAITOK);

	ne->subsys = he->subsys;
	ne->cmd_ranges = he->cmd_ranges;

	LIST_INSERT_HEAD(&he->map->mapping, ne, entries);

	return (0);
}
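/*
 * Registration sketch (hypothetical names): an emulation module pairs
 * this with mapped_ioctl_unregister_handler() at unload time.  The
 * fields referenced are the ones asserted on above:
 *
 *	static struct ioctl_map_handler my_handler = {
 *		.map = &my_ioctl_map,
 *		.subsys = "my_emul",
 *		.cmd_ranges = my_ranges,
 *	};
 *	mapped_ioctl_register_handler(&my_handler);
 */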
int
mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);

	LIST_FOREACH(ne, &he->map->mapping, entries) {
		if (ne->cmd_ranges != he->cmd_ranges)
			continue;
		LIST_REMOVE(ne, entries);
		kfree(ne, M_IOCTLMAP);
		return (0);
	}

	return (EINVAL);
}
static int	nselcoll;	/* Select collisions since boot */
int	selwait;
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
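/*
 * A "collision" is counted when more than one lwp records a select
 * request against the same selinfo at once; see the SI_COLL handling
 * in selrecord()/selwakeup() below.
 */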
/*
 * Select system call.
 */
int
sys_select(struct select_args *uap)
{
	struct timeval ktv;
	struct timeval *ktvp;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &ktv, sizeof (ktv));
		if (error)
			return (error);
		error = itimerfix(&ktv);
		if (error)
			return (error);
		ktvp = &ktv;
	} else {
		ktvp = NULL;
	}

	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp,
			 &uap->sysmsg_result);

	return (error);
}
/*
 * Pselect system call.
 */
int
sys_pselect(struct pselect_args *uap)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct timespec kts;
	struct timeval ktv;
	struct timeval *ktvp;
	sigset_t sigmask;
	int error;

	/*
	 * Get timeout if any and convert it.
	 * Round up during conversion to avoid timeout going off early.
	 */
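	/*
	 * Worked example (illustrative): ts = { 0, 1 } must not collapse
	 * to a zero timeout, so (1 + 999) / 1000 rounds the lone
	 * nanosecond up to one microsecond instead of truncating it.
	 */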
	if (uap->ts != NULL) {
		error = copyin(uap->ts, &kts, sizeof (kts));
		if (error)
			return (error);
		ktv.tv_sec = kts.tv_sec;
		ktv.tv_usec = (kts.tv_nsec + 999) / 1000;
		error = itimerfix(&ktv);
		if (error)
			return (error);
		ktvp = &ktv;
	} else {
		ktvp = NULL;
	}

	/*
	 * Install temporary signal mask if any provided.
	 */
	if (uap->sigmask != NULL) {
		error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
		if (error)
			return (error);
		lp->lwp_oldsigmask = lp->lwp_sigmask;
		SIG_CANTMASK(sigmask);
		lp->lwp_sigmask = sigmask;
	}

	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp,
			 &uap->sysmsg_result);

	if (uap->sigmask != NULL) {
		/* doselect() responsible for turning ERESTART into EINTR */
		KKASSERT(error != ERESTART);
		if (error == EINTR) {
			/*
			 * We can't restore the previous signal mask now
			 * because it could block the signal that interrupted
			 * us.  So make a note to restore it after executing
			 * the handler.
			 */
			lp->lwp_flag |= LWP_OLDMASK;
		} else {
			/*
			 * No handler to run.  Restore previous mask immediately.
			 */
			lp->lwp_sigmask = lp->lwp_oldsigmask;
		}
	}

	return (error);
}
/*
 * Common code for sys_select() and sys_pselect().
 *
 * in, out and ex are userland pointers.  tv must point to validated
 * kernel-side timeout value or NULL for infinite timeout.  res must
 * point to syscall return value.
 */
static int
doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timeval *tv,
	 int *res)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = curproc;

	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
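	/*
	 * Check of that arithmetic (illustrative): each non-null set needs
	 * both input and output copies, so 1024 infds alone consume
	 * 2 * 1024 = 2048 bits, while three sets of 256 fds consume
	 * 3 * 2 * 256 = 1536 bits; both fit in the 2048-bit buffer.
	 */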
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles)
		nd = p->p_fd->fd_nfiles;	/* forgiving; slightly wrong */

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = kmalloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL) {					\
			ibits[x] = NULL;				\
		} else {						\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done;				\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tv != NULL) {
		atv = *tv;
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
retry:
	ncoll = nselcoll;
	lp->lwp_flag |= LWP_SELECT;
	error = selscan(p, ibits, obits, nd, res);
	if (error || *res)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
	}
	crit_enter();
	if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) {
		crit_exit();
		goto retry;
	}
	lp->lwp_flag &= ~LWP_SELECT;
	error = tsleep((caddr_t)&selwait, PCATCH, "select", timo);
	crit_exit();
	if (error == 0)
		goto retry;
done:
	lp->lwp_flag &= ~LWP_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
	}
#undef	putbits
	if (selbits != &s_selbits[0])
		kfree(selbits, M_SELECT);
	return (error);
}
static int
selscan(struct proc *p, fd_mask **ibits, fd_mask **obits, int nfd, int *res)
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
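	/*
	 * Index note: flag[0..2] line up with ibits[0..2], which doselect()
	 * filled from the read, write and except sets respectively, so each
	 * set is polled with the matching POLLRDNORM/POLLWRNORM/POLLRDBAND
	 * event.
	 */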
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				fp = holdfp(p->p_fd, fd, -1);
				if (fp == NULL)
					return (EBADF);
				if (fo_poll(fp, flag[msk], fp->f_cred)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
				fdrop(fp);
			}
		}
	}
	*res = n;
	return (0);
}
/*
 * Poll system call.
 */
int
sys_poll(struct poll_args *uap)
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = curproc;

	nfds = uap->nfds;
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmalloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto out;
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto out;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
retry:
	ncoll = nselcoll;
	lp->lwp_flag |= LWP_SELECT;
	error = pollscan(p, bits, nfds, &uap->sysmsg_result);
	if (error || uap->sysmsg_result)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
	}
	crit_enter();
	if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) {
		crit_exit();
		goto retry;
	}
	lp->lwp_flag &= ~LWP_SELECT;
	error = tsleep((caddr_t)&selwait, PCATCH, "poll", timo);
	crit_exit();
	if (error == 0)
		goto retry;
done:
	lp->lwp_flag &= ~LWP_SELECT;
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, uap->fds, ni);
out:
	if (ni > sizeof(smallbits))
		kfree(bits, M_TEMP);
	return (error);
}
static int
pollscan(struct proc *p, struct pollfd *fds, u_int nfd, int *res)
{
	struct file *fp;
	u_int i;
	int n = 0;

	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= p->p_fd->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = holdfp(p->p_fd, fds->fd, -1);
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
						       fp->f_cred);
				if (fds->revents != 0)
					n++;
				fdrop(fp);
			}
		}
	}
	*res = n;
	return (0);
}
/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
int
sys_openbsd_poll(struct openbsd_poll_args *uap)
{
	return (sys_poll((struct poll_args *)uap));
}
int
seltrue(cdev_t dev, int events)
{
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
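/*
 * seltrue() is the generic "always ready" poll backend: every requested
 * read/write event is reported as pending.  Drivers whose devices never
 * block can typically point their poll entry here (a common convention;
 * individual drivers may differ).
 */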
/*
 * Record a select request.  A global wait must be used since a process/thread
 * might go away after recording its request.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	if (selector->td_lwp == NULL)
		panic("selrecord: thread needs a process");

	if (sip->si_pid == selector->td_proc->p_pid &&
	    sip->si_tid == selector->td_lwp->lwp_tid)
		return;
	if (sip->si_pid && (p = pfind(sip->si_pid)))
		lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid);
	if (lp != NULL && lp->lwp_wchan == (caddr_t)&selwait) {
		sip->si_flags |= SI_COLL;
	} else {
		sip->si_pid = selector->td_proc->p_pid;
		sip->si_tid = selector->td_lwp->lwp_tid;
	}
}
/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	if (sip->si_pid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);	/* YYY fixable */
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	if (p == NULL)
		return;
	lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid);
	if (lp == NULL)
		return;

	crit_enter();
	if (lp->lwp_wchan == (caddr_t)&selwait) {
		/*
		 * Flag the process to break the tsleep when
		 * setrunnable is called, but only call setrunnable
		 * here if the process is not in a stopped state.
		 */
		lp->lwp_flag |= LWP_BREAKTSLEEP;
		if (p->p_stat != SSTOP)
			setrunnable(lp);
	} else if (lp->lwp_flag & LWP_SELECT) {
		lp->lwp_flag &= ~LWP_SELECT;
	}
	crit_exit();
}