/*	$NetBSD: sys_select.c,v 1.13 2009/03/21 13:11:14 ad Exp $	*/

/*-
 * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
 */

/*
 * System calls relating to files.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.13 2009/03/21 13:11:14 ad Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/socketvar.h>
#include <sys/sleepq.h>

/* Flags for lwp::l_selflag. */
#define SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define SEL_SCANNING	1	/* polling descriptors */
#define SEL_BLOCKING	2	/* about to block on select_cv */

/* Per-CPU state for select()/poll(). */
#if MAXCPUS > 32
#error adjust this code
#endif
typedef struct selcpu {
	kmutex_t	*sc_lock;
	sleepq_t	sc_sleepq;
	int		sc_ncoll;
	uint32_t	sc_mask;
} selcpu_t;

static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
static void	selclear(void);

static syncobj_t select_sobj = {
	SOBJ_SLEEPQ_FIFO,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

/*
 * Select system call.
 */
int
sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int)				nd;
		syscallarg(fd_set *)			in;
		syscallarg(fd_set *)			ou;
		syscallarg(fd_set *)			ex;
		syscallarg(const struct timespec *)	ts;
		syscallarg(sigset_t *)			mask;
	} */
	struct timespec	ats, *ts = NULL;
	sigset_t	amask, *mask = NULL;
	int		error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		ts = &ats;
	}
	if (SCARG(uap, mask) != NULL) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), ts, mask);
}

int
inittimeleft(struct timespec *ts, struct timespec *sleepts)
{
	if (itimespecfix(ts))
		return -1;
	getnanouptime(sleepts);
	return 0;
}

int
gettimeleft(struct timespec *ts, struct timespec *sleepts)
{
	/*
	 * We have to recalculate the timeout on every retry.
	 */
	struct timespec sleptts;
	/*
	 * reduce ts by elapsed time
	 * based on monotonic time scale
	 */
	getnanouptime(&sleptts);
	timespecadd(ts, sleepts, ts);
	timespecsub(ts, &sleptts, ts);
	*sleepts = sleptts;
	return tstohz(ts);
}
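
/*
 * Illustrative sketch (not part of the original file): how a caller is
 * expected to use inittimeleft()/gettimeleft() to keep a relative timeout
 * accurate across retries, mirroring the loops in selcommon() and
 * pollcommon() below.  The example_* names are hypothetical.
 */
#if 0
static int
example_wait(struct timespec *ts)
{
	struct timespec sleepts;
	int timo = 0;

	/* Capture the starting point on the monotonic clock. */
	if (ts != NULL && inittimeleft(ts, &sleepts) == -1)
		return EINVAL;
	for (;;) {
		if (example_condition())	/* hypothetical readiness test */
			return 0;
		/* Recompute the remaining time before each sleep. */
		if (ts != NULL && (timo = gettimeleft(ts, &sleepts)) <= 0)
			return EWOULDBLOCK;
		/* ... block for at most 'timo' ticks, then retry ... */
	}
}
#endif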

int
sys___select50(struct lwp *l, const struct sys___select50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int)			nd;
		syscallarg(fd_set *)		in;
		syscallarg(fd_set *)		ou;
		syscallarg(fd_set *)		ex;
		syscallarg(struct timeval *)	tv;
	} */
	struct timeval atv;
	struct timespec ats, *ts = NULL;
	int error;

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv));
		if (error)
			return error;
		TIMEVAL_TO_TIMESPEC(&atv, &ats);
		ts = &ats;
	}

	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), ts, NULL);
}

int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
    fd_set *u_ou, fd_set *u_ex, struct timespec *ts, sigset_t *mask)
{
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	proc_t		* const p = l->l_proc;
	char		*bits;
	int		ncoll, error, timo;
	size_t		ni;
	sigset_t	oldmask;
	struct timespec	sleepts;
	selcpu_t	*sc;
	kmutex_t	*lock;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits)) {
		bits = kmem_alloc(ni * 6, KM_SLEEP);
		if (bits == NULL)
			return ENOMEM;
	} else
		bits = smallbits;

#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (ts && inittimeleft(ts, &sleepts) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(p->p_lock);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(p->p_lock);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	sc = curcpu()->ci_data.cpu_selcpu;
	lock = sc->sc_lock;
	l->l_selcpu = sc;
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * No need to lock.  If this is overwritten by another
		 * value while scanning, we will retry below.  We only
		 * need to see exact state from the descriptors that
		 * we are about to poll, and lock activity resulting
		 * from fo_poll is enough to provide an up to date value
		 * for new polling activity.
		 */
		l->l_selflag = SEL_SCANNING;
		ncoll = sc->sc_ncoll;

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		if (error || *retval)
			break;
		if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
			break;
		mutex_spin_enter(lock);
		if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
			mutex_spin_exit(lock);
			continue;
		}
		l->l_selflag = SEL_BLOCKING;
		l->l_kpriority = true;
		sleepq_enter(&sc->sc_sleepq, l, lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, "select", &select_sobj);
		error = sleepq_block(timo, true);
		if (error != 0)
			break;
	}
	selclear();

	if (mask) {
		mutex_enter(p->p_lock);
		l->l_sigmask = oldmask;
		mutex_exit(p->p_lock);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}

static int
selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
    register_t *retval)
{
	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
			       POLLWRNORM | POLLHUP | POLLERR,
			       POLLRDBAND };
	int msk, i, j, fd, n;
	fd_mask ibits, obits;
	file_t *fp;

	n = 0;
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			ibits = *ibitp++;
			obits = 0;
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1 << j);
				if ((fp = fd_getfile(fd)) == NULL)
					return (EBADF);
				if ((*fp->f_ops->fo_poll)(fp, flag[msk])) {
					obits |= (1 << j);
					n++;
				}
				fd_putfile(fd);
			}
			*obitp++ = obits;
		}
	}
	*retval = n;
	return (0);
}

/*
 * Poll system call.
 */
int
sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
{
	/* {
		syscallarg(struct pollfd *)	fds;
		syscallarg(u_int)		nfds;
		syscallarg(int)			timeout;
	} */
	struct timespec	ats, *ts = NULL;

	if (SCARG(uap, timeout) != INFTIM) {
		ats.tv_sec = SCARG(uap, timeout) / 1000;
		ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000;
		ts = &ats;
	}

	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
	    ts, NULL);
}

/*
 * Poll system call.
 */
int
sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(struct pollfd *)		fds;
		syscallarg(u_int)			nfds;
		syscallarg(const struct timespec *)	ts;
		syscallarg(const sigset_t *)		mask;
	} */
	struct timespec	ats, *ts = NULL;
	sigset_t	amask, *mask = NULL;
	int		error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		ts = &ats;
	}
	if (SCARG(uap, mask)) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
	    ts, mask);
}

int
pollcommon(lwp_t *l, register_t *retval, struct pollfd *u_fds, u_int nfds,
    struct timespec *ts, sigset_t *mask)
{
	struct pollfd	smallfds[32];
	struct pollfd	*fds;
	proc_t		* const p = l->l_proc;
	sigset_t	oldmask;
	int		ncoll, error, timo;
	size_t		ni;
	struct timespec	sleepts;
	selcpu_t	*sc;
	kmutex_t	*lock;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallfds)) {
		fds = kmem_alloc(ni, KM_SLEEP);
		if (fds == NULL)
			return ENOMEM;
	} else
		fds = smallfds;

	error = copyin(u_fds, fds, ni);
	if (error)
		goto done;

	timo = 0;
	if (ts && inittimeleft(ts, &sleepts) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(p->p_lock);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(p->p_lock);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	sc = curcpu()->ci_data.cpu_selcpu;
	lock = sc->sc_lock;
	l->l_selcpu = sc;
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * No need to lock.  If this is overwritten by another
		 * value while scanning, we will retry below.  We only
		 * need to see exact state from the descriptors that
		 * we are about to poll, and lock activity resulting
		 * from fo_poll is enough to provide an up to date value
		 * for new polling activity.
		 */
		ncoll = sc->sc_ncoll;
		l->l_selflag = SEL_SCANNING;

		error = pollscan(l, fds, nfds, retval);

		if (error || *retval)
			break;
		if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
			break;
		mutex_spin_enter(lock);
		if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
			mutex_spin_exit(lock);
			continue;
		}
		l->l_selflag = SEL_BLOCKING;
		l->l_kpriority = true;
		sleepq_enter(&sc->sc_sleepq, l, lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, "select", &select_sobj);
		error = sleepq_block(timo, true);
		if (error != 0)
			break;
	}
	selclear();

	if (mask) {
		mutex_enter(p->p_lock);
		l->l_sigmask = oldmask;
		mutex_exit(p->p_lock);
	}

 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(fds, u_fds, ni);
	if (fds != smallfds)
		kmem_free(fds, ni);
	return (error);
}

static int
pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
{
	int i, n;
	file_t *fp;

	n = 0;
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd < 0) {
			fds->revents = 0;
		} else if ((fp = fd_getfile(fds->fd)) == NULL) {
			fds->revents = POLLNVAL;
			n++;
		} else {
			fds->revents = (*fp->f_ops->fo_poll)(fp,
			    fds->events | POLLERR | POLLHUP);
			if (fds->revents != 0)
				n++;
			fd_putfile(fds->fd);
		}
	}
	*retval = n;
	return (0);
}

/*ARGSUSED*/
int
seltrue(dev_t dev, int events, lwp_t *l)
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
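
/*
 * Illustrative sketch (not part of the original file): a device that never
 * blocks can route its poll entry point through seltrue(), which reports
 * the read/write events as always pending.  The examplepoll() name is
 * hypothetical.
 */
#if 0
static int
examplepoll(dev_t dev, int events, lwp_t *l)
{

	/* No wait condition to record: report readiness immediately. */
	return seltrue(dev, events, l);
}
#endif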

/*
 * Record a select request.  Concurrency issues:
 *
 * The caller holds the same lock across calls to selrecord() and
 * selnotify(), so we don't need to consider a concurrent wakeup
 * while in this routine.
 *
 * The only activity we need to guard against is selclear(), called by
 * another thread that is exiting selcommon() or pollcommon().
 * `sel_lwp' can only become non-NULL while the caller's lock is held,
 * so it cannot become non-NULL due to a change made by another thread
 * while we are in this routine.  It can only become _NULL_ due to a
 * call to selclear().
 *
 * If it is non-NULL and != selector there is the potential for
 * selclear() to be called by another thread.  If either of those
 * conditions are true, we're not interested in touching the `named
 * waiter' part of the selinfo record because we need to record a
 * collision.  Hence there is no need for additional locking in this
 * routine.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{
	selcpu_t *sc;
	lwp_t *other;

	KASSERT(selector == curlwp);

	sc = selector->l_selcpu;
	other = sip->sel_lwp;

	if (other == selector) {
		/* `selector' has already claimed it. */
		KASSERT(sip->sel_cpu == sc);
	} else if (other == NULL) {
		/*
		 * First named waiter, although there may be unnamed
		 * waiters (collisions).  Issue a memory barrier to
		 * ensure that we access sel_lwp (above) before other
		 * fields - this guards against a call to selclear().
		 */
		membar_enter();
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
		/* Replace selinfo's lock with our chosen CPU's lock. */
		sip->sel_cpu = sc;
	} else {
		/* Multiple waiters: record a collision. */
		sip->sel_collision |= sc->sc_mask;
		KASSERT(sip->sel_cpu != NULL);
	}
}
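
/*
 * Illustrative sketch (not part of the original file): the usual shape of
 * a driver poll routine that calls selrecord().  With the driver's own
 * lock held, it reports events that are already pending, otherwise it
 * records the calling LWP so that a later selnotify() can wake it.  The
 * example_softc structure and example_lookup() are hypothetical.
 */
#if 0
struct example_softc {
	kmutex_t	sc_dlock;	/* also held around selnotify() */
	struct selinfo	sc_rsel;
	bool		sc_data_ready;
};

static int
example_rdpoll(dev_t dev, int events, lwp_t *l)
{
	struct example_softc *sc = example_lookup(dev);
	int revents = 0;

	mutex_enter(&sc->sc_dlock);
	if ((events & (POLLIN | POLLRDNORM)) != 0) {
		if (sc->sc_data_ready)
			revents |= events & (POLLIN | POLLRDNORM);
		else
			selrecord(l, &sc->sc_rsel);
	}
	mutex_exit(&sc->sc_dlock);
	return revents;
}
#endif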

/*
 * Do a wakeup when a selectable event occurs.  Concurrency issues:
 *
 * As per selrecord(), the caller's object lock is held.  If there
 * is a named waiter, we must acquire the associated selcpu's lock
 * in order to synchronize with selclear() and pollers going to sleep
 * in selcommon() and/or pollcommon().
 *
 * sip->sel_cpu cannot change at this point, as it is only changed
 * in selrecord(), and concurrent calls to selrecord() are locked
 * out by the caller.
 */
void
selnotify(struct selinfo *sip, int events, long knhint)
{
	selcpu_t *sc;
	uint32_t mask;
	int index, oflag, swapin;
	lwp_t *l;
	kmutex_t *lock;

	KNOTE(&sip->sel_klist, knhint);

	if (sip->sel_lwp != NULL) {
		/* One named LWP is waiting. */
		swapin = 0;
		sc = sip->sel_cpu;
		lock = sc->sc_lock;
		mutex_spin_enter(lock);
		/* Still there? */
		if (sip->sel_lwp != NULL) {
			l = sip->sel_lwp;
			/*
			 * If thread is sleeping, wake it up.  If it's not
			 * yet asleep, it will notice the change in state
			 * and will re-poll the descriptors.
			 */
			oflag = l->l_selflag;
			l->l_selflag = SEL_RESET;
			if (oflag == SEL_BLOCKING && l->l_mutex == lock) {
				KASSERT(l->l_wchan == sc);
				swapin = sleepq_unsleep(l, false);
			}
		}
		mutex_spin_exit(lock);
		if (swapin)
			uvm_kick_scheduler();
	}

	if ((mask = sip->sel_collision) != 0) {
		/*
		 * There was a collision (multiple waiters): we must
		 * inform all potentially interested waiters.
		 */
		sip->sel_collision = 0;
		do {
			index = ffs(mask) - 1;
			mask &= ~(1 << index);
			sc = cpu_lookup(index)->ci_data.cpu_selcpu;
			lock = sc->sc_lock;
			mutex_spin_enter(lock);
			sc->sc_ncoll++;
			sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock);
		} while (__predict_false(mask != 0));
	}
}
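
/*
 * Illustrative sketch (not part of the original file): the hypothetical
 * driver from the selrecord() example marks data available and calls
 * selnotify() under the same lock it holds around selrecord(), waking any
 * select()/poll() waiters.
 */
#if 0
static void
example_rxintr(struct example_softc *sc)
{

	mutex_enter(&sc->sc_dlock);
	sc->sc_data_ready = true;
	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, 0);
	mutex_exit(&sc->sc_dlock);
}
#endif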

/*
 * Remove an LWP from all objects that it is waiting for.  Concurrency
 * issues:
 *
 * The object owner's (e.g. device driver) lock is not held here.  Calls
 * can be made to selrecord() and we do not synchronize against those
 * directly using locks.  However, we use `sel_lwp' to lock out changes.
 * Before clearing it we must use memory barriers to ensure that we can
 * safely traverse the list of selinfo records.
 */
static void
selclear(void)
{
	struct selinfo *sip, *next;
	selcpu_t *sc;
	lwp_t *l;
	kmutex_t *lock;

	l = curlwp;
	sc = l->l_selcpu;
	lock = sc->sc_lock;

	mutex_spin_enter(lock);
	for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) {
		KASSERT(sip->sel_lwp == l);
		KASSERT(sip->sel_cpu == l->l_selcpu);
		/*
		 * Read link to next selinfo record, if any.
		 * It's no longer safe to touch `sip' after clearing
		 * `sel_lwp', so ensure that the read of `sel_chain'
		 * completes before the clearing of sel_lwp becomes
		 * globally visible.
		 */
		next = SLIST_NEXT(sip, sel_chain);
		membar_exit();
		/* Release the record for another named waiter to use. */
		sip->sel_lwp = NULL;
	}
	mutex_spin_exit(lock);
}

/*
 * Initialize the select/poll system calls.  Called once for each
 * CPU in the system, as they are attached.
 */
void
selsysinit(struct cpu_info *ci)
{
	selcpu_t *sc;

	sc = kmem_alloc(roundup2(sizeof(selcpu_t), coherency_unit) +
	    coherency_unit, KM_SLEEP);
	sc = (void *)roundup2((uintptr_t)sc, coherency_unit);
	sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	sleepq_init(&sc->sc_sleepq);
	sc->sc_ncoll = 0;
	sc->sc_mask = (1 << cpu_index(ci));
	ci->ci_data.cpu_selcpu = sc;
}

/*
 * Initialize a selinfo record.
 */
void
selinit(struct selinfo *sip)
{

	memset(sip, 0, sizeof(*sip));
}

/*
 * Destroy a selinfo record.  The owning object must not gain new
 * references while this is in progress: all activity on the record
 * must be stopped.
 *
 * Concurrency issues: we only need guard against a call to selclear()
 * by a thread exiting selcommon() and/or pollcommon().  The caller has
 * prevented further references being made to the selinfo record via
 * selrecord(), and it won't call selwakeup() again.
 */
void
seldestroy(struct selinfo *sip)
{
	selcpu_t *sc;
	kmutex_t *lock;
	lwp_t *l;

	if (sip->sel_lwp == NULL)
		return;

	/*
	 * Lock out selclear().  The selcpu pointer can't change while
	 * we are here since it is only ever changed in selrecord(),
	 * and that will not be entered again for this record because
	 * it is dying.
	 */
	KASSERT(sip->sel_cpu != NULL);
	sc = sip->sel_cpu;
	lock = sc->sc_lock;
	mutex_spin_enter(lock);
	if ((l = sip->sel_lwp) != NULL) {
		/*
		 * This should rarely happen, so although SLIST_REMOVE()
		 * is slow, using it here is not a problem.
		 */
		KASSERT(l->l_selcpu == sc);
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_spin_exit(lock);
}
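
/*
 * Illustrative sketch (not part of the original file): a selinfo record
 * embedded in the hypothetical softc above is set up with selinit() at
 * attach time and torn down with seldestroy() at detach time, once the
 * device can no longer generate events.
 */
#if 0
static void
example_attach(struct example_softc *sc)
{

	mutex_init(&sc->sc_dlock, MUTEX_DEFAULT, IPL_NONE);
	selinit(&sc->sc_rsel);
}

static void
example_detach(struct example_softc *sc)
{

	seldestroy(&sc->sc_rsel);
	mutex_destroy(&sc->sc_dlock);
}
#endif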

int
pollsock(struct socket *so, const struct timespec *tsp, int events)
{
	int		ncoll, error, timo;
	struct timespec	sleepts, ts;
	selcpu_t	*sc;
	lwp_t		*l;
	kmutex_t	*lock;

	timo = 0;
	if (tsp != NULL) {
		ts = *tsp;
		if (inittimeleft(&ts, &sleepts) == -1)
			return EINVAL;
	}

	l = curlwp;
	sc = l->l_cpu->ci_data.cpu_selcpu;
	lock = sc->sc_lock;
	l->l_selcpu = sc;
	SLIST_INIT(&l->l_selwait);
	error = 0;
	for (;;) {
		/*
		 * No need to lock.  If this is overwritten by another
		 * value while scanning, we will retry below.  We only
		 * need to see exact state from the descriptors that
		 * we are about to poll, and lock activity resulting
		 * from fo_poll is enough to provide an up to date value
		 * for new polling activity.
		 */
		ncoll = sc->sc_ncoll;
		l->l_selflag = SEL_SCANNING;
		if (sopoll(so, events) != 0)
			break;
		if (tsp && (timo = gettimeleft(&ts, &sleepts)) <= 0)
			break;
		mutex_spin_enter(lock);
		if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
			mutex_spin_exit(lock);
			continue;
		}
		l->l_selflag = SEL_BLOCKING;
		sleepq_enter(&sc->sc_sleepq, l, lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, "pollsock", &select_sobj);
		error = sleepq_block(timo, true);
		if (error != 0)
			break;
	}
	selclear();
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	return (error);
}
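
/*
 * Illustrative sketch (not part of the original file): in-kernel code can
 * use pollsock() to wait for activity on a socket with a bounded timeout.
 * The five second value is hypothetical; as written above, pollsock()
 * returns 0 once the events are pending or the timeout expires, and EINTR
 * if the wait was interrupted by a signal.
 */
#if 0
static int
example_wait_for_reply(struct socket *so)
{
	struct timespec ts;

	ts.tv_sec = 5;
	ts.tv_nsec = 0;
	return pollsock(so, &ts, POLLIN | POLLRDNORM);
}
#endif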