2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
39 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
40 * $DragonFly: src/sys/kern/sys_generic.c,v 1.48 2008/04/14 12:01:50 dillon Exp $
43 #include "opt_ktrace.h"
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/sysproto.h>
48 #include <sys/filedesc.h>
49 #include <sys/filio.h>
50 #include <sys/fcntl.h>
53 #include <sys/signalvar.h>
54 #include <sys/socketvar.h>
56 #include <sys/kernel.h>
57 #include <sys/kern_syscall.h>
58 #include <sys/malloc.h>
59 #include <sys/mapped_ioctl.h>
61 #include <sys/queue.h>
62 #include <sys/resourcevar.h>
63 #include <sys/sysctl.h>
64 #include <sys/sysent.h>
67 #include <sys/ktrace.h>
70 #include <vm/vm_page.h>
71 #include <sys/file2.h>
73 #include <machine/limits.h>
75 static MALLOC_DEFINE(M_IOCTLOPS
, "ioctlops", "ioctl data buffer");
76 static MALLOC_DEFINE(M_IOCTLMAP
, "ioctlmap", "mapped ioctl handler buffer");
77 static MALLOC_DEFINE(M_SELECT
, "select", "select() buffer");
78 MALLOC_DEFINE(M_IOV
, "iov", "large iov's");
80 static int doselect(int nd
, fd_set
*in
, fd_set
*ou
, fd_set
*ex
,
81 struct timeval
*tv
, int *res
);
82 static int pollscan (struct proc
*, struct pollfd
*, u_int
, int *);
83 static int selscan (struct proc
*, fd_mask
**, fd_mask
**,
85 static int dofileread(int, struct file
*, struct uio
*, int, int *);
86 static int dofilewrite(int, struct file
*, struct uio
*, int, int *);
94 sys_read(struct read_args
*uap
)
96 struct thread
*td
= curthread
;
101 aiov
.iov_base
= uap
->buf
;
102 aiov
.iov_len
= uap
->nbyte
;
103 auio
.uio_iov
= &aiov
;
105 auio
.uio_offset
= -1;
106 auio
.uio_resid
= uap
->nbyte
;
107 auio
.uio_rw
= UIO_READ
;
108 auio
.uio_segflg
= UIO_USERSPACE
;
111 if (auio
.uio_resid
< 0)
114 error
= kern_preadv(uap
->fd
, &auio
, 0, &uap
->sysmsg_result
);
119 * Positioned (Pread) read system call
124 sys_extpread(struct extpread_args
*uap
)
126 struct thread
*td
= curthread
;
132 aiov
.iov_base
= uap
->buf
;
133 aiov
.iov_len
= uap
->nbyte
;
134 auio
.uio_iov
= &aiov
;
136 auio
.uio_offset
= uap
->offset
;
137 auio
.uio_resid
= uap
->nbyte
;
138 auio
.uio_rw
= UIO_READ
;
139 auio
.uio_segflg
= UIO_USERSPACE
;
142 flags
= uap
->flags
& O_FMASK
;
143 if (uap
->offset
!= (off_t
)-1)
146 if (auio
.uio_resid
< 0)
149 error
= kern_preadv(uap
->fd
, &auio
, flags
, &uap
->sysmsg_result
);
154 * Scatter read system call.
159 sys_readv(struct readv_args
*uap
)
161 struct thread
*td
= curthread
;
163 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
166 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
171 auio
.uio_iovcnt
= uap
->iovcnt
;
172 auio
.uio_offset
= -1;
173 auio
.uio_rw
= UIO_READ
;
174 auio
.uio_segflg
= UIO_USERSPACE
;
177 error
= kern_preadv(uap
->fd
, &auio
, 0, &uap
->sysmsg_result
);
179 iovec_free(&iov
, aiov
);
185 * Scatter positioned read system call.
190 sys_extpreadv(struct extpreadv_args
*uap
)
192 struct thread
*td
= curthread
;
194 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
198 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
203 auio
.uio_iovcnt
= uap
->iovcnt
;
204 auio
.uio_offset
= uap
->offset
;
205 auio
.uio_rw
= UIO_READ
;
206 auio
.uio_segflg
= UIO_USERSPACE
;
209 flags
= uap
->flags
& O_FMASK
;
210 if (uap
->offset
!= (off_t
)-1)
213 error
= kern_preadv(uap
->fd
, &auio
, flags
, &uap
->sysmsg_result
);
215 iovec_free(&iov
, aiov
);
223 kern_preadv(int fd
, struct uio
*auio
, int flags
, int *res
)
225 struct thread
*td
= curthread
;
226 struct proc
*p
= td
->td_proc
;
232 fp
= holdfp(p
->p_fd
, fd
, FREAD
);
235 if (flags
& O_FOFFSET
&& fp
->f_type
!= DTYPE_VNODE
) {
237 } else if (auio
->uio_resid
< 0) {
240 error
= dofileread(fd
, fp
, auio
, flags
, res
);
247 * Common code for readv and preadv that reads data in
248 * from a file using the passed in uio, offset, and flags.
250 * MPALMOSTSAFE - ktrace needs help
253 dofileread(int fd
, struct file
*fp
, struct uio
*auio
, int flags
, int *res
)
255 struct thread
*td
= curthread
;
259 struct iovec
*ktriov
= NULL
;
265 * if tracing, save a copy of iovec
267 if (KTRPOINT(td
, KTR_GENIO
)) {
268 int iovlen
= auio
->uio_iovcnt
* sizeof(struct iovec
);
270 MALLOC(ktriov
, struct iovec
*, iovlen
, M_TEMP
, M_WAITOK
);
271 bcopy((caddr_t
)auio
->uio_iov
, (caddr_t
)ktriov
, iovlen
);
275 len
= auio
->uio_resid
;
276 error
= fo_read(fp
, auio
, fp
->f_cred
, flags
);
278 if (auio
->uio_resid
!= len
&& (error
== ERESTART
||
279 error
== EINTR
|| error
== EWOULDBLOCK
))
283 if (ktriov
!= NULL
) {
285 ktruio
.uio_iov
= ktriov
;
286 ktruio
.uio_resid
= len
- auio
->uio_resid
;
288 ktrgenio(td
->td_lwp
, fd
, UIO_READ
, &ktruio
, error
);
291 FREE(ktriov
, M_TEMP
);
295 *res
= len
- auio
->uio_resid
;
306 sys_write(struct write_args
*uap
)
308 struct thread
*td
= curthread
;
313 aiov
.iov_base
= (void *)(uintptr_t)uap
->buf
;
314 aiov
.iov_len
= uap
->nbyte
;
315 auio
.uio_iov
= &aiov
;
317 auio
.uio_offset
= -1;
318 auio
.uio_resid
= uap
->nbyte
;
319 auio
.uio_rw
= UIO_WRITE
;
320 auio
.uio_segflg
= UIO_USERSPACE
;
323 if (auio
.uio_resid
< 0)
326 error
= kern_pwritev(uap
->fd
, &auio
, 0, &uap
->sysmsg_result
);
337 sys_extpwrite(struct extpwrite_args
*uap
)
339 struct thread
*td
= curthread
;
345 aiov
.iov_base
= (void *)(uintptr_t)uap
->buf
;
346 aiov
.iov_len
= uap
->nbyte
;
347 auio
.uio_iov
= &aiov
;
349 auio
.uio_offset
= uap
->offset
;
350 auio
.uio_resid
= uap
->nbyte
;
351 auio
.uio_rw
= UIO_WRITE
;
352 auio
.uio_segflg
= UIO_USERSPACE
;
355 flags
= uap
->flags
& O_FMASK
;
356 if (uap
->offset
!= (off_t
)-1)
359 if (auio
.uio_resid
< 0)
362 error
= kern_pwritev(uap
->fd
, &auio
, flags
, &uap
->sysmsg_result
);
371 sys_writev(struct writev_args
*uap
)
373 struct thread
*td
= curthread
;
375 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
378 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
383 auio
.uio_iovcnt
= uap
->iovcnt
;
384 auio
.uio_offset
= -1;
385 auio
.uio_rw
= UIO_WRITE
;
386 auio
.uio_segflg
= UIO_USERSPACE
;
389 error
= kern_pwritev(uap
->fd
, &auio
, 0, &uap
->sysmsg_result
);
391 iovec_free(&iov
, aiov
);
397 * Gather positioned write system call
402 sys_extpwritev(struct extpwritev_args
*uap
)
404 struct thread
*td
= curthread
;
406 struct iovec aiov
[UIO_SMALLIOV
], *iov
= NULL
;
410 error
= iovec_copyin(uap
->iovp
, &iov
, aiov
, uap
->iovcnt
,
415 auio
.uio_iovcnt
= uap
->iovcnt
;
416 auio
.uio_offset
= uap
->offset
;
417 auio
.uio_rw
= UIO_WRITE
;
418 auio
.uio_segflg
= UIO_USERSPACE
;
421 flags
= uap
->flags
& O_FMASK
;
422 if (uap
->offset
!= (off_t
)-1)
425 error
= kern_pwritev(uap
->fd
, &auio
, flags
, &uap
->sysmsg_result
);
427 iovec_free(&iov
, aiov
);
435 kern_pwritev(int fd
, struct uio
*auio
, int flags
, int *res
)
437 struct thread
*td
= curthread
;
438 struct proc
*p
= td
->td_proc
;
444 fp
= holdfp(p
->p_fd
, fd
, FWRITE
);
447 else if ((flags
& O_FOFFSET
) && fp
->f_type
!= DTYPE_VNODE
) {
450 error
= dofilewrite(fd
, fp
, auio
, flags
, res
);
458 * Common code for writev and pwritev that writes data to
459 * a file using the passed in uio, offset, and flags.
461 * MPALMOSTSAFE - ktrace needs help
464 dofilewrite(int fd
, struct file
*fp
, struct uio
*auio
, int flags
, int *res
)
466 struct thread
*td
= curthread
;
467 struct lwp
*lp
= td
->td_lwp
;
471 struct iovec
*ktriov
= NULL
;
477 * if tracing, save a copy of iovec and uio
479 if (KTRPOINT(td
, KTR_GENIO
)) {
480 int iovlen
= auio
->uio_iovcnt
* sizeof(struct iovec
);
482 MALLOC(ktriov
, struct iovec
*, iovlen
, M_TEMP
, M_WAITOK
);
483 bcopy((caddr_t
)auio
->uio_iov
, (caddr_t
)ktriov
, iovlen
);
487 len
= auio
->uio_resid
;
488 if (fp
->f_type
== DTYPE_VNODE
)
490 error
= fo_write(fp
, auio
, fp
->f_cred
, flags
);
492 if (auio
->uio_resid
!= len
&& (error
== ERESTART
||
493 error
== EINTR
|| error
== EWOULDBLOCK
))
495 /* Socket layer is responsible for issuing SIGPIPE. */
496 if (error
== EPIPE
) {
498 lwpsignal(lp
->lwp_proc
, lp
, SIGPIPE
);
503 if (ktriov
!= NULL
) {
505 ktruio
.uio_iov
= ktriov
;
506 ktruio
.uio_resid
= len
- auio
->uio_resid
;
508 ktrgenio(lp
, fd
, UIO_WRITE
, &ktruio
, error
);
511 FREE(ktriov
, M_TEMP
);
515 *res
= len
- auio
->uio_resid
;
525 sys_ioctl(struct ioctl_args
*uap
)
527 return(mapped_ioctl(uap
->fd
, uap
->com
, uap
->data
, NULL
));
530 struct ioctl_map_entry
{
532 struct ioctl_map_range
*cmd_ranges
;
533 LIST_ENTRY(ioctl_map_entry
) entries
;
537 * The true heart of all ioctl syscall handlers (native, emulation).
538 * If map != NULL, it will be searched for a matching entry for com,
539 * and appropriate conversions/conversion functions will be utilized.
542 mapped_ioctl(int fd
, u_long com
, caddr_t uspc_data
, struct ioctl_map
*map
)
544 struct thread
*td
= curthread
;
545 struct proc
*p
= td
->td_proc
;
548 struct ioctl_map_range
*iomc
= NULL
;
554 #define STK_PARAMS 128
556 char stkbuf
[STK_PARAMS
];
563 fp
= holdfp(p
->p_fd
, fd
, FREAD
|FWRITE
);
567 if (map
!= NULL
) { /* obey translation map */
569 struct ioctl_map_entry
*e
;
571 maskcmd
= com
& map
->mask
;
573 LIST_FOREACH(e
, &map
->mapping
, entries
) {
574 for (iomc
= e
->cmd_ranges
; iomc
->start
!= 0 ||
575 iomc
->maptocmd
!= 0 || iomc
->wrapfunc
!= NULL
||
576 iomc
->mapfunc
!= NULL
;
578 if (maskcmd
>= iomc
->start
&&
579 maskcmd
<= iomc
->end
)
583 /* Did we find a match? */
584 if (iomc
->start
!= 0 || iomc
->maptocmd
!= 0 ||
585 iomc
->wrapfunc
!= NULL
|| iomc
->mapfunc
!= NULL
)
590 (iomc
->start
== 0 && iomc
->maptocmd
== 0
591 && iomc
->wrapfunc
== NULL
&& iomc
->mapfunc
== NULL
)) {
592 kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
593 map
->sys
, fd
, maskcmd
,
594 (int)((maskcmd
>> 8) & 0xff),
595 (int)(maskcmd
& 0xff));
601 * If it's a non-range one to one mapping, maptocmd should be
602 * correct. If it's a ranged one to one mapping, we pass the
603 * original value of com, and for a range mapped to a different
604 * range, we always need a mapping function to translate the
605 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
607 if (iomc
->start
== iomc
->end
&& iomc
->maptocmd
== iomc
->maptoend
) {
608 com
= iomc
->maptocmd
;
609 } else if (iomc
->start
== iomc
->maptocmd
&& iomc
->end
== iomc
->maptoend
) {
610 if (iomc
->mapfunc
!= NULL
)
611 com
= iomc
->mapfunc(iomc
->start
, iomc
->end
,
612 iomc
->start
, iomc
->end
,
615 if (iomc
->mapfunc
!= NULL
) {
616 com
= iomc
->mapfunc(iomc
->start
, iomc
->end
,
617 iomc
->maptocmd
, iomc
->maptoend
,
620 kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
621 map
->sys
, fd
, maskcmd
,
622 (int)((maskcmd
>> 8) & 0xff),
623 (int)(maskcmd
& 0xff));
632 error
= fclrfdflags(p
->p_fd
, fd
, UF_EXCLOSE
);
635 error
= fsetfdflags(p
->p_fd
, fd
, UF_EXCLOSE
);
640 * Interpret high order word to find amount of data to be
641 * copied to/from the user's address space.
643 size
= IOCPARM_LEN(com
);
644 if (size
> IOCPARM_MAX
) {
650 if (size
> sizeof (ubuf
.stkbuf
)) {
651 memp
= kmalloc(size
, M_IOCTLOPS
, M_WAITOK
);
656 if ((com
& IOC_IN
) != 0) {
658 error
= copyin(uspc_data
, data
, (u_int
)size
);
661 kfree(memp
, M_IOCTLOPS
);
665 *(caddr_t
*)data
= uspc_data
;
667 } else if ((com
& IOC_OUT
) != 0 && size
) {
669 * Zero the buffer so the user always
670 * gets back something deterministic.
673 } else if ((com
& IOC_VOID
) != 0) {
674 *(caddr_t
*)data
= uspc_data
;
679 if ((tmp
= *(int *)data
))
680 fp
->f_flag
|= FNONBLOCK
;
682 fp
->f_flag
&= ~FNONBLOCK
;
687 if ((tmp
= *(int *)data
))
688 fp
->f_flag
|= FASYNC
;
690 fp
->f_flag
&= ~FASYNC
;
691 error
= fo_ioctl(fp
, FIOASYNC
, (caddr_t
)&tmp
, cred
);
696 * If there is an override function,
697 * call it instead of directly routing the call
699 if (map
!= NULL
&& iomc
->wrapfunc
!= NULL
)
700 error
= iomc
->wrapfunc(fp
, com
, ocom
, data
, cred
);
702 error
= fo_ioctl(fp
, com
, data
, cred
);
704 * Copy any data to user, size was
705 * already set and checked above.
707 if (error
== 0 && (com
& IOC_OUT
) != 0 && size
!= 0)
708 error
= copyout(data
, uspc_data
, (u_int
)size
);
712 kfree(memp
, M_IOCTLOPS
);
719 mapped_ioctl_register_handler(struct ioctl_map_handler
*he
)
721 struct ioctl_map_entry
*ne
;
723 KKASSERT(he
!= NULL
&& he
->map
!= NULL
&& he
->cmd_ranges
!= NULL
&&
724 he
->subsys
!= NULL
&& *he
->subsys
!= '\0');
726 ne
= kmalloc(sizeof(struct ioctl_map_entry
), M_IOCTLMAP
, M_WAITOK
);
728 ne
->subsys
= he
->subsys
;
729 ne
->cmd_ranges
= he
->cmd_ranges
;
731 LIST_INSERT_HEAD(&he
->map
->mapping
, ne
, entries
);
737 mapped_ioctl_unregister_handler(struct ioctl_map_handler
*he
)
739 struct ioctl_map_entry
*ne
;
741 KKASSERT(he
!= NULL
&& he
->map
!= NULL
&& he
->cmd_ranges
!= NULL
);
743 LIST_FOREACH(ne
, &he
->map
->mapping
, entries
) {
744 if (ne
->cmd_ranges
!= he
->cmd_ranges
)
746 LIST_REMOVE(ne
, entries
);
747 kfree(ne
, M_IOCTLMAP
);
753 static int nselcoll
; /* Select collisions since boot */
755 SYSCTL_INT(_kern
, OID_AUTO
, nselcoll
, CTLFLAG_RD
, &nselcoll
, 0, "");
758 * Select system call.
761 sys_select(struct select_args
*uap
)
764 struct timeval
*ktvp
;
768 * Get timeout if any.
770 if (uap
->tv
!= NULL
) {
771 error
= copyin(uap
->tv
, &ktv
, sizeof (ktv
));
774 error
= itimerfix(&ktv
);
785 error
= doselect(uap
->nd
, uap
->in
, uap
->ou
, uap
->ex
, ktvp
,
786 &uap
->sysmsg_result
);
793 * Pselect system call.
796 sys_pselect(struct pselect_args
*uap
)
798 struct thread
*td
= curthread
;
799 struct lwp
*lp
= td
->td_lwp
;
802 struct timeval
*ktvp
;
807 * Get timeout if any and convert it.
808 * Round up during conversion to avoid timeout going off early.
810 if (uap
->ts
!= NULL
) {
811 error
= copyin(uap
->ts
, &kts
, sizeof (kts
));
814 ktv
.tv_sec
= kts
.tv_sec
;
815 ktv
.tv_usec
= (kts
.tv_nsec
+ 999) / 1000;
816 error
= itimerfix(&ktv
);
825 * Install temporary signal mask if any provided.
827 if (uap
->sigmask
!= NULL
) {
828 error
= copyin(uap
->sigmask
, &sigmask
, sizeof(sigmask
));
831 lp
->lwp_oldsigmask
= lp
->lwp_sigmask
;
832 SIG_CANTMASK(sigmask
);
833 lp
->lwp_sigmask
= sigmask
;
839 error
= doselect(uap
->nd
, uap
->in
, uap
->ou
, uap
->ex
, ktvp
,
840 &uap
->sysmsg_result
);
842 if (uap
->sigmask
!= NULL
) {
843 /* doselect() responsible for turning ERESTART into EINTR */
844 KKASSERT(error
!= ERESTART
);
845 if (error
== EINTR
) {
847 * We can't restore the previous signal mask now
848 * because it could block the signal that interrupted
849 * us. So make a note to restore it after executing
852 lp
->lwp_flag
|= LWP_OLDMASK
;
855 * No handler to run. Restore previous mask immediately.
857 lp
->lwp_sigmask
= lp
->lwp_oldsigmask
;
865 * Common code for sys_select() and sys_pselect().
867 * in, out and ex are userland pointers. tv must point to validated
868 * kernel-side timeout value or NULL for infinite timeout. res must
869 * point to syscall return value.
872 doselect(int nd
, fd_set
*in
, fd_set
*ou
, fd_set
*ex
, struct timeval
*tv
,
875 struct lwp
*lp
= curthread
->td_lwp
;
876 struct proc
*p
= curproc
;
879 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
880 * infds with the new FD_SETSIZE of 1024, and more than enough for
881 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
884 fd_mask s_selbits
[howmany(2048, NFDBITS
)];
885 fd_mask
*ibits
[3], *obits
[3], *selbits
, *sbp
;
886 struct timeval atv
, rtv
, ttv
;
887 int ncoll
, error
, timo
;
888 u_int nbufbytes
, ncpbytes
, nfdbits
;
892 if (nd
> p
->p_fd
->fd_nfiles
)
893 nd
= p
->p_fd
->fd_nfiles
; /* forgiving; slightly wrong */
896 * Allocate just enough bits for the non-null fd_sets. Use the
897 * preallocated auto buffer if possible.
899 nfdbits
= roundup(nd
, NFDBITS
);
900 ncpbytes
= nfdbits
/ NBBY
;
903 nbufbytes
+= 2 * ncpbytes
;
905 nbufbytes
+= 2 * ncpbytes
;
907 nbufbytes
+= 2 * ncpbytes
;
908 if (nbufbytes
<= sizeof s_selbits
)
909 selbits
= &s_selbits
[0];
911 selbits
= kmalloc(nbufbytes
, M_SELECT
, M_WAITOK
);
914 * Assign pointers into the bit buffers and fetch the input bits.
915 * Put the output buffers together so that they can be bzeroed
919 #define getbits(name, x) \
924 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
926 sbp += ncpbytes / sizeof *sbp; \
927 error = copyin(name, ibits[x], ncpbytes); \
937 bzero(selbits
, nbufbytes
/ 2);
941 getmicrouptime(&rtv
);
942 timevaladd(&atv
, &rtv
);
950 lp
->lwp_flag
|= LWP_SELECT
;
951 error
= selscan(p
, ibits
, obits
, nd
, res
);
954 if (atv
.tv_sec
|| atv
.tv_usec
) {
955 getmicrouptime(&rtv
);
956 if (timevalcmp(&rtv
, &atv
, >=))
959 timevalsub(&ttv
, &rtv
);
960 timo
= ttv
.tv_sec
> 24 * 60 * 60 ?
961 24 * 60 * 60 * hz
: tvtohz_high(&ttv
);
964 if ((lp
->lwp_flag
& LWP_SELECT
) == 0 || nselcoll
!= ncoll
) {
968 lp
->lwp_flag
&= ~LWP_SELECT
;
970 error
= tsleep((caddr_t
)&selwait
, PCATCH
, "select", timo
);
976 lp
->lwp_flag
&= ~LWP_SELECT
;
977 /* select is not restarted after signals... */
978 if (error
== ERESTART
)
980 if (error
== EWOULDBLOCK
)
982 #define putbits(name, x) \
983 if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
993 if (selbits
!= &s_selbits
[0])
994 kfree(selbits
, M_SELECT
);
999 selscan(struct proc
*p
, fd_mask
**ibits
, fd_mask
**obits
, int nfd
, int *res
)
1005 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
1006 static int flag
[3] = { POLLRDNORM
, POLLWRNORM
, POLLRDBAND
};
1008 for (msk
= 0; msk
< 3; msk
++) {
1009 if (ibits
[msk
] == NULL
)
1011 for (i
= 0; i
< nfd
; i
+= NFDBITS
) {
1012 bits
= ibits
[msk
][i
/NFDBITS
];
1013 /* ffs(int mask) not portable, fd_mask is long */
1014 for (fd
= i
; bits
&& fd
< nfd
; fd
++, bits
>>= 1) {
1017 fp
= holdfp(p
->p_fd
, fd
, -1);
1020 if (fo_poll(fp
, flag
[msk
], fp
->f_cred
)) {
1021 obits
[msk
][(fd
)/NFDBITS
] |=
1022 ((fd_mask
)1 << ((fd
) % NFDBITS
));
1037 sys_poll(struct poll_args
*uap
)
1039 struct pollfd
*bits
;
1040 struct pollfd smallbits
[32];
1041 struct timeval atv
, rtv
, ttv
;
1042 int ncoll
, error
= 0, timo
;
1045 struct lwp
*lp
= curthread
->td_lwp
;
1046 struct proc
*p
= curproc
;
1050 * This is kinda bogus. We have fd limits, but that is not
1051 * really related to the size of the pollfd array. Make sure
1052 * we let the process use at least FD_SETSIZE entries and at
1053 * least enough for the current limits. We want to be reasonably
1054 * safe, but not overly restrictive.
1056 if (nfds
> p
->p_rlimit
[RLIMIT_NOFILE
].rlim_cur
&& nfds
> FD_SETSIZE
)
1058 ni
= nfds
* sizeof(struct pollfd
);
1059 if (ni
> sizeof(smallbits
))
1060 bits
= kmalloc(ni
, M_TEMP
, M_WAITOK
);
1063 error
= copyin(uap
->fds
, bits
, ni
);
1066 if (uap
->timeout
!= INFTIM
) {
1067 atv
.tv_sec
= uap
->timeout
/ 1000;
1068 atv
.tv_usec
= (uap
->timeout
% 1000) * 1000;
1069 if (itimerfix(&atv
)) {
1073 getmicrouptime(&rtv
);
1074 timevaladd(&atv
, &rtv
);
1082 lp
->lwp_flag
|= LWP_SELECT
;
1083 error
= pollscan(p
, bits
, nfds
, &uap
->sysmsg_result
);
1084 if (error
|| uap
->sysmsg_result
)
1086 if (atv
.tv_sec
|| atv
.tv_usec
) {
1087 getmicrouptime(&rtv
);
1088 if (timevalcmp(&rtv
, &atv
, >=))
1091 timevalsub(&ttv
, &rtv
);
1092 timo
= ttv
.tv_sec
> 24 * 60 * 60 ?
1093 24 * 60 * 60 * hz
: tvtohz_high(&ttv
);
1096 if ((lp
->lwp_flag
& LWP_SELECT
) == 0 || nselcoll
!= ncoll
) {
1100 lp
->lwp_flag
&= ~LWP_SELECT
;
1101 error
= tsleep((caddr_t
)&selwait
, PCATCH
, "poll", timo
);
1106 lp
->lwp_flag
&= ~LWP_SELECT
;
1107 /* poll is not restarted after signals... */
1108 if (error
== ERESTART
)
1110 if (error
== EWOULDBLOCK
)
1113 error
= copyout(bits
, uap
->fds
, ni
);
1118 if (ni
> sizeof(smallbits
))
1119 kfree(bits
, M_TEMP
);
1124 pollscan(struct proc
*p
, struct pollfd
*fds
, u_int nfd
, int *res
)
1130 for (i
= 0; i
< nfd
; i
++, fds
++) {
1131 if (fds
->fd
>= p
->p_fd
->fd_nfiles
) {
1132 fds
->revents
= POLLNVAL
;
1134 } else if (fds
->fd
< 0) {
1137 fp
= holdfp(p
->p_fd
, fds
->fd
, -1);
1139 fds
->revents
= POLLNVAL
;
1143 * Note: backend also returns POLLHUP and
1144 * POLLERR if appropriate.
1146 fds
->revents
= fo_poll(fp
, fds
->events
,
1148 if (fds
->revents
!= 0)
/*
 * OpenBSD poll system call.
 *
 * XXX this isn't quite a true representation: OpenBSD uses select ops.
 * The argument block is simply reinterpreted as a struct poll_args and
 * handed to sys_poll().
 */
int
sys_openbsd_poll(struct openbsd_poll_args *uap)
{
	struct poll_args *pargs = (struct poll_args *)uap;

	return (sys_poll(pargs));
}
1170 seltrue(cdev_t dev
, int events
)
1172 return (events
& (POLLIN
| POLLOUT
| POLLRDNORM
| POLLWRNORM
));
1176 * Record a select request. A global wait must be used since a process/thread
1177 * might go away after recording its request.
1180 selrecord(struct thread
*selector
, struct selinfo
*sip
)
1183 struct lwp
*lp
= NULL
;
1185 if (selector
->td_lwp
== NULL
)
1186 panic("selrecord: thread needs a process");
1188 if (sip
->si_pid
== selector
->td_proc
->p_pid
&&
1189 sip
->si_tid
== selector
->td_lwp
->lwp_tid
)
1191 if (sip
->si_pid
&& (p
= pfind(sip
->si_pid
)))
1192 lp
= lwp_rb_tree_RB_LOOKUP(&p
->p_lwp_tree
, sip
->si_tid
);
1193 if (lp
!= NULL
&& lp
->lwp_wchan
== (caddr_t
)&selwait
) {
1194 sip
->si_flags
|= SI_COLL
;
1196 sip
->si_pid
= selector
->td_proc
->p_pid
;
1197 sip
->si_tid
= selector
->td_lwp
->lwp_tid
;
1202 * Do a wakeup when a selectable event occurs.
1205 selwakeup(struct selinfo
*sip
)
1208 struct lwp
*lp
= NULL
;
1210 if (sip
->si_pid
== 0)
1212 if (sip
->si_flags
& SI_COLL
) {
1214 sip
->si_flags
&= ~SI_COLL
;
1215 wakeup((caddr_t
)&selwait
); /* YYY fixable */
1217 p
= pfind(sip
->si_pid
);
1221 lp
= lwp_rb_tree_RB_LOOKUP(&p
->p_lwp_tree
, sip
->si_tid
);
1226 if (lp
->lwp_wchan
== (caddr_t
)&selwait
) {
1228 * Flag the process to break the tsleep when
1229 * setrunnable is called, but only call setrunnable
1230 * here if the process is not in a stopped state.
1232 lp
->lwp_flag
|= LWP_BREAKTSLEEP
;
1233 if (p
->p_stat
!= SSTOP
)
1235 } else if (lp
->lwp_flag
& LWP_SELECT
) {
1236 lp
->lwp_flag
&= ~LWP_SELECT
;