/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#include <sys/param.h>
#include <sys/isa_defs.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/mode.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/poll_impl.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/bitmap.h>
#include <sys/kstat.h>
#include <sys/rctl.h>
#include <sys/port_impl.h>
#include <sys/schedctl.h>
#include <sys/cpu.h>

#define	NPHLOCKS	64	/* Number of locks; must be power of 2 */
#define	PHLOCKADDR(php)	&plocks[(((uintptr_t)(php)) >> 8) & (NPHLOCKS - 1)]
#define	PHLOCK(php)	PHLOCKADDR(php).pp_lock
#define	PH_ENTER(php)	mutex_enter(PHLOCK(php))
#define	PH_EXIT(php)	mutex_exit(PHLOCK(php))
#define	VALID_POLL_EVENTS	(POLLIN | POLLPRI | POLLOUT | POLLRDNORM \
	| POLLRDBAND | POLLWRBAND | POLLHUP | POLLERR | POLLNVAL)
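
/*
 * The macros above stripe pollhead locks across plocks[]: a pollhead's
 * address, with low-order (allocation-alignment) bits shifted off, picks
 * one of NPHLOCKS mutexes, so unrelated pollheads usually contend on
 * different locks. Illustrative expansion only (not additional code):
 *
 *	pollhead_t *php = ...;
 *	size_t idx = (((uintptr_t)php) >> 8) & (NPHLOCKS - 1);
 *	mutex_enter(&plocks[idx].pp_lock);	== PH_ENTER(php)
 *	mutex_exit(&plocks[idx].pp_lock);	== PH_EXIT(php)
 */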

/*
 * global counters to collect some stats
 */
static struct {
	kstat_named_t	polllistmiss;	/* failed to find a cached poll list */
	kstat_named_t	pollcachehit;	/* list matched 100% w/ cached one */
	kstat_named_t	pollcachephit;	/* list matched < 100% w/ cached one */
	kstat_named_t	pollcachemiss;	/* every list entry is dif from cache */
} pollstats = {
	{ "polllistmiss",	KSTAT_DATA_UINT64 },
	{ "pollcachehit",	KSTAT_DATA_UINT64 },
	{ "pollcachephit",	KSTAT_DATA_UINT64 },
	{ "pollcachemiss",	KSTAT_DATA_UINT64 }
};

kstat_named_t *pollstats_ptr = (kstat_named_t *)&pollstats;
uint_t pollstats_ndata = sizeof (pollstats) / sizeof (kstat_named_t);

struct pplock {
	kmutex_t	pp_lock;
	short		pp_flag;
	kcondvar_t	pp_wait_cv;
	int32_t		pp_pad;		/* to a nice round 16 bytes */
};

static struct pplock plocks[NPHLOCKS];	/* Hash array of pollhead locks */

#ifdef DEBUG
static int pollchecksanity(pollstate_t *, nfds_t);
static int pollcheckxref(pollstate_t *, int);
static void pollcheckphlist(void);
static int pollcheckrevents(pollstate_t *, int, int, int);
static void checkpolldat(pollstate_t *);
#endif	/* DEBUG */
static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *, int,
    int *);

/*
 * Data structure overview:
 * The per-thread poll state consists of
 *	one pollstate_t
 *	one pollcache_t
 *	one bitmap with one event bit per fd
 *	a (two-dimensional) hashed array of polldat_t structures - one entry
 *	per fd
 *
 * This conglomerate of data structures interacts with
 *	the pollhead which is used by VOP_POLL and pollwakeup
 *	(protected by the PHLOCK, cached array of plocks), and
 *	the fpollinfo list hanging off the fi_list which is used to notify
 *	poll when a cached fd is closed. This is protected by uf_lock.
 *
 * Invariants:
 *	pd_php (pollhead pointer) is set iff (if and only if) the polldat
 *	is on that pollhead. This is modified atomically under pc_lock.
 *
 *	pd_fp (file_t pointer) is set iff the thread is on the fpollinfo
 *	list for that open file.
 *	This is modified atomically under pc_lock.
 *
 *	pd_count is the sum (over all values of i) of pd_ref[i].xf_refcnt.
 *	Iff pd_ref[i].xf_refcnt >= 1 then
 *		ps_pcacheset[i].pcs_pollfd[pd_ref[i].xf_position].fd == pd_fd
 *	Iff pd_ref[i].xf_refcnt > 1 then
 *		In ps_pcacheset[i].pcs_pollfd between index
 *		pd_ref[i].xf_position and the end of the list
 *		there are xf_refcnt entries with .fd == pd_fd
 *
 * Locking design:
 * Whenever possible the design relies on the fact that the poll cache state
 * is per thread, and thus for both poll and exit it is self-synchronizing.
 * Thus the key interactions where other threads access the state are:
 *	pollwakeup (and polltime), and
 *	close cleaning up the cached references to an open file
 *
 * The two key locks in poll proper are ps_lock and pc_lock.
 *
 * The ps_lock is used for synchronization between poll, (lwp_)exit and close
 * to ensure that modifications to pollcacheset structure are serialized.
 * This lock is held through most of poll() except where poll sleeps
 * since there is little need to handle closes concurrently with the execution
 * of poll.
 * The pc_lock protects most of the fields in pollcache structure and polldat
 * structures (which are accessed by poll, pollwakeup, and polltime)
 * with the exception of fields that are only modified when only one thread
 * can access this per-thread state.
 * Those exceptions occur in poll when first allocating the per-thread state,
 * when poll grows the number of polldat (never shrinks), and when
 * exit/pollcleanup has ensured that there are no references from either
 * pollheads or fpollinfo to the thread's poll state.
 *
 * The poll(2) system call is the only path in which ps_lock and pc_lock are
 * both held, in that order. It needs ps_lock to synchronize with close and
 * lwp_exit; and pc_lock with pollwakeup.
 *
 * The locking interaction between pc_lock and PHLOCK takes into account
 * that poll acquires these locks in the order of pc_lock and then PHLOCK
 * while pollwakeup does it in the reverse order. Thus pollwakeup implements
 * deadlock avoidance by dropping the locks and reacquiring them in the
 * reverse order. For this to work pollwakeup needs to prevent the thread
 * from exiting and freeing all of the poll related state. This is done
 * using
 *	the pc_no_exit lock
 *	the pc_busy counter
 *	the pc_busy_cv condition variable
 *
 * The locking interaction between pc_lock and uf_lock has similar
 * issues. Poll holds ps_lock and/or pc_lock across calls to getf/releasef
 * which acquire uf_lock. The poll cleanup in close needs to hold uf_lock
 * to prevent poll or exit from doing a delfpollinfo after which the thread
 * might exit. But the cleanup needs to acquire pc_lock when modifying
 * the poll cache state. The solution is to use pc_busy and do the close
 * cleanup in two phases:
 *	First close calls pollblockexit which increments pc_busy.
 *	This prevents the per-thread poll related state from being freed.
 *	Then close drops uf_lock and calls pollcacheclean.
 *	This routine can then acquire pc_lock and remove any references
 *	to the closing fd (as well as recording that it has been closed
 *	so that a POLLNVAL can be generated even if the fd is reused before
 *	poll has been woken up and checked getf() again).
 *
 * When removing a polled fd from poll cache, the fd is always removed
 * from pollhead list first and then from fpollinfo list, i.e.,
 * pollhead_delete() is called before delfpollinfo().
 *
 *
 * Locking hierarchy:
 *	pc_no_exit is a leaf level lock.
 *	ps_lock is held when acquiring pc_lock (except when pollwakeup
 *	acquires pc_lock).
 *	pc_lock might be held when acquiring PHLOCK (pollhead_insert/
 *	pollhead_delete)
 *	pc_lock is always held (but this is not required)
 *	when acquiring PHLOCK (in polladd/pollhead_delete and pollwakeup called
 *	from pcache_clean_entry).
 *	pc_lock is held across addfpollinfo/delfpollinfo which acquire
 *	uf_lock.
 *	pc_lock is held across getf/releasef which acquire uf_lock.
 *	ps_lock might be held across getf/releasef which acquire uf_lock.
 *	pollwakeup tries to acquire pc_lock while holding PHLOCK
 *	but drops the locks and reacquires them in reverse order to avoid
 *	deadlock.
 *
 * Note also that there is deadlock avoidance support for VOP_POLL routines
 * and pollwakeup involving a file system or driver lock.
 * See below.
 */
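
/*
 * A condensed ordering sketch of the two-phase close cleanup described
 * above. This is an outline, not the actual close() code; the
 * pollblockexit()/pollcacheclean() callers live in the file descriptor
 * code and their exact calling sequence is assumed here:
 *
 *	close(fd):
 *		hold uf_lock
 *		pollblockexit(...)	increments pc_busy under pc_no_exit,
 *					so the per-thread poll state cannot
 *					be freed out from under the cleanup
 *		drop uf_lock
 *		pollcacheclean(...)	takes pc_lock, marks the fd's cached
 *					entries POLLCLOSED, and releases the
 *					pc_busy hold
 */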

/*
 * Deadlock avoidance support for VOP_POLL() routines. This is
 * sometimes necessary to prevent deadlock between polling threads
 * (which hold poll locks on entry to xx_poll(), then acquire foo)
 * and pollwakeup() threads (which hold foo, then acquire poll locks).
 *
 * pollunlock(void) releases whatever poll locks the current thread holds,
 *	returning a cookie for use by pollrelock();
 *
 * pollrelock(cookie) reacquires previously dropped poll locks;
 *
 * polllock(php, mutex) does the common case: pollunlock(),
 *	acquire the problematic mutex, pollrelock().
 */
int
pollunlock(void)
{
	pollcache_t *pcp;
	int lockstate = 0;

	/*
	 * t_pollcache is set by /dev/poll and event ports (port_fd.c).
	 * If pollrelock/pollunlock is called as a result of poll(2),
	 * the t_pollcache should be NULL.
	 */
	if (curthread->t_pollcache == NULL)
		pcp = curthread->t_pollstate->ps_pcache;
	else
		pcp = curthread->t_pollcache;

	if (mutex_owned(&pcp->pc_lock)) {
		lockstate = 1;
		mutex_exit(&pcp->pc_lock);
	}
	return (lockstate);
}

void
pollrelock(int lockstate)
{
	pollcache_t *pcp;

	/*
	 * t_pollcache is set by /dev/poll and event ports (port_fd.c).
	 * If pollrelock/pollunlock is called as a result of poll(2),
	 * the t_pollcache should be NULL.
	 */
	if (curthread->t_pollcache == NULL)
		pcp = curthread->t_pollstate->ps_pcache;
	else
		pcp = curthread->t_pollcache;

	if (lockstate > 0)
		mutex_enter(&pcp->pc_lock);
}

/* ARGSUSED */
void
polllock(pollhead_t *php, kmutex_t *lp)
{
	if (!mutex_tryenter(lp)) {
		int lockstate = pollunlock();
		mutex_enter(lp);
		pollrelock(lockstate);
	}
}
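
/*
 * A minimal sketch (not an excerpt from any real driver) of how a poll
 * entry point is expected to use polllock() when the driver's wakeup
 * path holds the same mutex; xx_php and xx_lock are hypothetical names.
 * polllock() is used in place of mutex_enter(), so a concurrent
 * pollwakeup() that already holds xx_lock cannot deadlock against the
 * poll locks this thread holds:
 *
 *	static int
 *	xx_poll(dev_t dev, short events, int anyyet, short *reventsp,
 *	    struct pollhead **phpp)
 *	{
 *		polllock(&xx_php, &xx_lock);
 *		*reventsp = events & xx_pending(dev);
 *		mutex_exit(&xx_lock);
 *		return (0);
 *	}
 */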

static int
poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	int fdcnt = 0;
	int i;
	hrtime_t deadline; /* hrtime value when we want to return */
	pollfd_t *pollfdp;
	pollstate_t *ps;
	pollcache_t *pcp;
	int error = 0;
	nfds_t old_nfds;
	int cacheindex = 0;	/* which cache set is used */

	/*
	 * Determine the precise future time of the requested timeout, if any.
	 */
	if (tsp == NULL) {
		deadline = -1;
	} else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
		deadline = 0;
	} else {
		/* They must wait at least a tick. */
		deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec;
		deadline = MAX(deadline, nsec_per_tick);
		deadline += gethrtime();
	}

	/*
	 * Reset our signal mask, if requested.
	 */
	if (ksetp != NULL) {
		mutex_enter(&p->p_lock);
		schedctl_finish_sigblock(t);
		lwp->lwp_sigoldmask = t->t_hold;
		t->t_hold = *ksetp;
		t->t_flag |= T_TOMASK;
		/*
		 * Call cv_reltimedwait_sig() just to check for signals.
		 * We will return immediately with either 0 or -1.
		 */
		if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
		    TR_CLOCK_TICK)) {
			mutex_exit(&p->p_lock);
			error = EINTR;
			goto pollout;
		}
		mutex_exit(&p->p_lock);
	}

	/*
	 * Check to see if the caller just wants to use poll() as a timeout.
	 * If yes then bypass all the other stuff and make it sleep.
	 */
	if (nfds == 0) {
		/*
		 * Sleep until we have passed the requested future
		 * time or until interrupted by a signal.
		 * Do not check for signals if we do not want to wait.
		 */
		if (deadline != 0) {
			mutex_enter(&t->t_delay_lock);
			while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv,
			    &t->t_delay_lock, deadline)) > 0)
				continue;
			mutex_exit(&t->t_delay_lock);
			error = (error == 0) ? EINTR : 0;
		}
		goto pollout;
	}

	if (nfds > p->p_fno_ctl) {
		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
		    p->p_rctls, p, RCA_SAFE);
		mutex_exit(&p->p_lock);
		error = EINVAL;
		goto pollout;
	}

	/*
	 * Need to allocate memory for pollstate before anything because
	 * the mutex and cv are created in this space
	 */
	if ((ps = t->t_pollstate) == NULL) {
		t->t_pollstate = pollstate_create();
		ps = t->t_pollstate;
	}

	if (ps->ps_pcache == NULL)
		ps->ps_pcache = pcache_alloc();
	pcp = ps->ps_pcache;

	/*
	 * NOTE: for performance, buffers are saved across poll() calls.
	 * The theory is that if a process polls heavily, it tends to poll
	 * on the same set of descriptors. Therefore, we only reallocate
	 * buffers when nfds changes. There is no hysteresis control,
	 * because there is no data to suggest that this is necessary;
	 * the penalty of reallocating is not *that* great in any event.
	 */
	old_nfds = ps->ps_nfds;
	if (nfds != old_nfds) {
		kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
		pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
		ps->ps_pollfd = pollfdp;
		ps->ps_nfds = nfds;
	}

	pollfdp = ps->ps_pollfd;
	if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
		error = EFAULT;
		goto pollout;
	}

	if (fds == NULL) {
		/*
		 * If the process has page 0 mapped, then the copyin() above
		 * will succeed even if fds is NULL. However, our cached
		 * poll lists are keyed by the address of the passed-in fds
		 * structure, and we use the value NULL to indicate an unused
		 * poll cache list entry. As such, we elect not to support
		 * NULL as a valid (user) memory address and fail the poll()
		 * call.
		 */
		error = EINVAL;
		goto pollout;
	}

	/*
	 * If this thread polls for the first time, allocate ALL poll
	 * cache data structures and cache the poll fd list. This
	 * allocation is delayed till now because lwp's polling 0 fd
	 * (i.e. using poll as timeout()) don't need this memory.
	 */
	mutex_enter(&ps->ps_lock);
	pcp = ps->ps_pcache;
	ASSERT(pcp != NULL);
	if (pcp->pc_bitmap == NULL) {
		pcache_create(pcp, nfds);
		/*
		 * poll and cache this poll fd list in ps_pcacheset[0].
		 */
		error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex);
		if (fdcnt || error) {
			mutex_exit(&ps->ps_lock);
			goto pollout;
		}
	} else {
		pollcacheset_t *pcset = ps->ps_pcacheset;

		/*
		 * Not first time polling. Select a cached poll list by
		 * matching user pollfd list buffer address.
		 */
		for (cacheindex = 0; cacheindex < ps->ps_nsets; cacheindex++) {
			if (pcset[cacheindex].pcs_usradr == (uintptr_t)fds) {
				if ((++pcset[cacheindex].pcs_count) == 0) {
					/*
					 * counter is wrapping around.
					 */
					pcacheset_reset_count(ps, cacheindex);
				}
				/*
				 * examine and resolve possible
				 * difference of the current poll
				 * list and previously cached one.
				 * If there is an error during resolve(),
				 * the callee will guarantee the consistency
				 * of cached poll list and cache content.
				 */
				error = pcacheset_resolve(ps, nfds, &fdcnt,
				    cacheindex);
				if (error) {
					mutex_exit(&ps->ps_lock);
					goto pollout;
				}
				break;
			}

			/*
			 * Note that the pcs_usradr field of a used entry
			 * won't be NULL, because it stores the address of
			 * passed-in fds and NULL fds will not be cached
			 * (either it is the special timeout case when nfds
			 * is 0, or the call fails directly).
			 */
			if (pcset[cacheindex].pcs_usradr == NULL) {
				/*
				 * found an unused entry. Use it to cache
				 * this poll list.
				 */
				error = pcacheset_cache_list(ps, fds, &fdcnt,
				    cacheindex);
				if (fdcnt || error) {
					mutex_exit(&ps->ps_lock);
					goto pollout;
				}
				break;
			}
		}
		if (cacheindex == ps->ps_nsets) {
			/*
			 * We failed to find a matching cached poll fd list.
			 * replace an old list.
			 */
			pollstats.polllistmiss.value.ui64++;
			cacheindex = pcacheset_replace(ps);
			ASSERT(cacheindex < ps->ps_nsets);
			pcset[cacheindex].pcs_usradr = (uintptr_t)fds;
			error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex);
			if (error) {
				mutex_exit(&ps->ps_lock);
				goto pollout;
			}
		}
	}

	/*
	 * Always scan the bitmap with the lock on the pollcache held.
	 * This is to make sure that a wakeup does not come undetected.
	 * If the lock is not held, a pollwakeup could have come for an
	 * fd we already checked but before this thread sleeps, in which
	 * case the wakeup is missed. Now we hold the pcache lock and
	 * check the bitmap again. This will prevent wakeup from happening
	 * while we hold pcache lock since pollwakeup() will also lock
	 * the pcache before updating poll bitmap.
	 */
	mutex_enter(&pcp->pc_lock);
	for (;;) {
		pcp->pc_flag = 0;
		error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex);
		if (fdcnt || error) {
			mutex_exit(&pcp->pc_lock);
			mutex_exit(&ps->ps_lock);
			break;
		}

		/*
		 * If T_POLLWAKE is set, a pollwakeup() was performed on
		 * one of the file descriptors. This can happen only if
		 * one of the VOP_POLL() functions dropped pcp->pc_lock.
		 * The only current cases of this are in procfs (prpoll())
		 * and STREAMS (strpoll()).
		 */
		if (pcp->pc_flag & T_POLLWAKE)
			continue;

		/*
		 * If you get here, the poll of fds was unsuccessful.
		 * Wait until some fd becomes readable, writable, or gets
		 * an exception, or until a signal or a timeout occurs.
		 * Do not check for signals if we have a zero timeout.
		 */
		mutex_exit(&ps->ps_lock);
		if (deadline == 0) {
			error = -1;
		} else {
			error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
			    &pcp->pc_lock, deadline);
		}
		mutex_exit(&pcp->pc_lock);
		/*
		 * If we have received a signal or timed out
		 * then break out and return.
		 */
		if (error <= 0) {
			error = (error == 0) ? EINTR : 0;
			break;
		}
		/*
		 * We have not received a signal or timed out.
		 * Continue around and poll fds again.
		 */
		mutex_enter(&ps->ps_lock);
		mutex_enter(&pcp->pc_lock);
	}

pollout:
	/*
	 * If we changed the signal mask but we received
	 * no signal then restore the signal mask.
	 * Otherwise psig() will deal with the signal mask.
	 */
	if (ksetp != NULL) {
		mutex_enter(&p->p_lock);
		if (lwp->lwp_cursig == 0) {
			t->t_hold = lwp->lwp_sigoldmask;
			t->t_flag &= ~T_TOMASK;
		}
		mutex_exit(&p->p_lock);
	}

	if (error)
		return (set_errno(error));

	/*
	 * Copy out the events and return the fdcnt to the user.
	 */
	if (nfds != 0 &&
	    copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
		return (set_errno(EFAULT));

#ifdef DEBUG
	/*
	 * Another sanity check:
	 */
	if (fdcnt) {
		int reventcnt = 0;

		for (i = 0; i < nfds; i++) {
			if (pollfdp[i].fd < 0) {
				ASSERT(pollfdp[i].revents == 0);
				continue;
			}
			if (pollfdp[i].revents) {
				reventcnt++;
			}
		}
		ASSERT(fdcnt == reventcnt);
	} else {
		for (i = 0; i < nfds; i++) {
			ASSERT(pollfdp[i].revents == 0);
		}
	}
#endif	/* DEBUG */

	return (fdcnt);
}

/*
 * This is the system call trap that poll(),
 * select() and pselect() are built upon.
 * It is a private interface between libc and the kernel.
 */
int
pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
{
	timespec_t ts;
	timespec_t *tsp;
	sigset_t set;
	k_sigset_t kset;
	k_sigset_t *ksetp;
	model_t datamodel = get_udatamodel();

	if (timeoutp == NULL)
		tsp = NULL;
	else {
		if (datamodel == DATAMODEL_NATIVE) {
			if (copyin(timeoutp, &ts, sizeof (ts)))
				return (set_errno(EFAULT));
		} else {
			timespec32_t ts32;

			if (copyin(timeoutp, &ts32, sizeof (ts32)))
				return (set_errno(EFAULT));
			TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
		}

		if (itimerspecfix(&ts))
			return (set_errno(EINVAL));
		tsp = &ts;
	}

	if (setp == NULL)
		ksetp = NULL;
	else {
		if (copyin(setp, &set, sizeof (set)))
			return (set_errno(EFAULT));
		sigutok(&set, &kset);
		ksetp = &kset;
	}

	return (poll_common(fds, nfds, tsp, ksetp));
}
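
/*
 * For illustration only: a userland poll(2) built on this trap would look
 * roughly like the sketch below (hypothetical prototype; the real wrapper
 * lives in libc, not here). poll's millisecond timeout becomes a
 * timespec_t, and no signal mask is passed:
 *
 *	extern int pollsys(pollfd_t *, nfds_t, timespec_t *, sigset_t *);
 *
 *	int
 *	poll(pollfd_t *fds, nfds_t nfds, int timeout)
 *	{
 *		timespec_t ts, *tsp = NULL;
 *
 *		if (timeout >= 0) {
 *			ts.tv_sec = timeout / 1000;
 *			ts.tv_nsec = (timeout % 1000) * 1000000;
 *			tsp = &ts;
 *		}
 *		return (pollsys(fds, nfds, tsp, NULL));
 *	}
 */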

/*
 * Clean up any state left around by poll(2). Called when a thread exits.
 */
void
pollcleanup()
{
	pollstate_t *ps = curthread->t_pollstate;
	pollcache_t *pcp;

	if (ps == NULL)
		return;
	pcp = ps->ps_pcache;
	/*
	 * free up all cached poll fds
	 */
	if (pcp == NULL) {
		/* this pollstate is used by /dev/poll */
		goto pollcleanout;
	}

	if (pcp->pc_bitmap != NULL) {
		ASSERT(MUTEX_NOT_HELD(&ps->ps_lock));
		/*
		 * a close lwp can race with us when cleaning up a polldat
		 * entry. We hold the ps_lock when cleaning hash table.
		 * Since this pollcache is going away anyway, there is no
		 * need to hold the pc_lock.
		 */
		mutex_enter(&ps->ps_lock);
		pcache_clean(pcp);
		mutex_exit(&ps->ps_lock);
#ifdef DEBUG
		/*
		 * At this point, all fds cached by this lwp should be
		 * cleaned up. There should be no fd in fi_list still
		 * referencing this thread.
		 */
		checkfpollinfo();	/* sanity check */
		pollcheckphlist();	/* sanity check */
#endif	/* DEBUG */
	}
	/*
	 * Be sure no one is referencing thread before exiting
	 */
	mutex_enter(&pcp->pc_no_exit);
	ASSERT(pcp->pc_busy >= 0);
	while (pcp->pc_busy > 0)
		cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
	mutex_exit(&pcp->pc_no_exit);
pollcleanout:
	pollstate_destroy(ps);
	curthread->t_pollstate = NULL;
}

/*
 * pollwakeup() - poke threads waiting in poll() for some event
 * on a particular object.
 *
 * The threads hanging off of the specified pollhead structure are scanned.
 * If their event mask matches the specified event(s), then pollnotify() is
 * called to poke the thread.
 *
 * Multiple events may be specified. When POLLHUP or POLLERR are specified,
 * all waiting threads are poked.
 *
 * It is important that pollnotify() not drop the lock protecting the list
 * of threads.
 */
void
pollwakeup(pollhead_t *php, short events_arg)
{
	polldat_t	*pdp;
	int		events = (ushort_t)events_arg;
	struct plist {
		port_t *pp;
		int	pevents;
		struct plist *next;
	};
	struct plist *plhead = NULL, *pltail = NULL;

retry:
	PH_ENTER(php);

	for (pdp = php->ph_list; pdp; pdp = pdp->pd_next) {
		if ((pdp->pd_events & events) ||
		    (events & (POLLHUP | POLLERR))) {

			pollcache_t	*pcp;

			if (pdp->pd_portev != NULL) {
				port_kevent_t	*pkevp = pdp->pd_portev;
				/*
				 * Object (fd) is associated with an event port,
				 * => send event notification to the port.
				 */
				ASSERT(pkevp->portkev_source == PORT_SOURCE_FD);
				mutex_enter(&pkevp->portkev_lock);
				if (pkevp->portkev_flags & PORT_KEV_VALID) {
					int pevents;

					pkevp->portkev_flags &= ~PORT_KEV_VALID;
					pkevp->portkev_events |= events &
					    (pdp->pd_events | POLLHUP |
					    POLLERR);
					/*
					 * portkev_lock mutex will be released
					 * by port_send_event().
					 */
					port_send_event(pkevp);

					/*
					 * If we have some thread polling the
					 * port's fd, add it to the list. They
					 * will be notified later.
					 * The port_pollwkup() will flag the
					 * port_t so that it will not disappear
					 * till port_pollwkdone() is called.
					 */
					pevents =
					    port_pollwkup(pkevp->portkev_port);
					if (pevents) {
						struct plist *t;
						t = kmem_zalloc(
						    sizeof (struct plist),
						    KM_SLEEP);
						t->pp = pkevp->portkev_port;
						t->pevents = pevents;
						if (plhead == NULL) {
							plhead = t;
						} else {
							pltail->next = t;
						}
						pltail = t;
					}
				} else {
					mutex_exit(&pkevp->portkev_lock);
				}
				continue;
			}

			pcp = pdp->pd_pcache;

			/*
			 * Try to grab the lock for this thread. If
			 * we don't get it then we may deadlock so
			 * back out and restart all over again. Note
			 * that the failure rate is very very low.
			 */
			if (mutex_tryenter(&pcp->pc_lock)) {
				pollnotify(pcp, pdp->pd_fd);
				mutex_exit(&pcp->pc_lock);
			} else {
				/*
				 * We are here because:
				 *	1) This thread has been woken up
				 *	   and is trying to get out of poll().
				 *	2) Some other thread is also here
				 *	   but with a different pollhead lock.
				 *
				 * So, we need to drop the lock on pollhead
				 * because of (1) but we want to prevent
				 * that thread from doing lwp_exit() or
				 * devpoll close. We want to ensure that
				 * the pollcache pointer is still valid.
				 *
				 * Solution: Grab the pcp->pc_no_exit lock,
				 * increment the pc_busy counter, drop every
				 * lock in sight. Get out of the way and wait
				 * for type (2) threads to finish.
				 */

				mutex_enter(&pcp->pc_no_exit);
				pcp->pc_busy++;	/* prevents exit()'s */
				mutex_exit(&pcp->pc_no_exit);

				PH_EXIT(php);
				mutex_enter(&pcp->pc_lock);
				mutex_exit(&pcp->pc_lock);
				mutex_enter(&pcp->pc_no_exit);
				pcp->pc_busy--;
				if (pcp->pc_busy == 0) {
					/*
					 * Wakeup the thread waiting in
					 * thread_exit().
					 */
					cv_signal(&pcp->pc_busy_cv);
				}
				mutex_exit(&pcp->pc_no_exit);
				goto retry;
			}
		}
	}

	/*
	 * Event ports - If this php is of the port on the list,
	 * call port_pollwkdone() to release it. The port_pollwkdone()
	 * needs to be called before dropping the PH lock so that any new
	 * threads attempting to poll this port are blocked. There can be
	 * only one thread here in pollwakeup notifying this port's fd.
	 */
	if (plhead != NULL && &plhead->pp->port_pollhd == php) {
		struct plist *t;
		port_pollwkdone(plhead->pp);
		t = plhead;
		plhead = plhead->next;
		kmem_free(t, sizeof (struct plist));
	}
	PH_EXIT(php);

	/*
	 * Event ports - Notify threads polling the event port's fd.
	 * This is normally done in port_send_event() where it calls
	 * pollwakeup() on the port. But, for PORT_SOURCE_FD source alone,
	 * we do it here in pollwakeup() to avoid a recursive call.
	 */
	if (plhead != NULL) {
		php = &plhead->pp->port_pollhd;
		events = plhead->pevents;
		goto retry;
	}
}
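
/*
 * For context, a sketch of the producer side of this protocol, following
 * the chpoll(9E)/pollwakeup(9F) pattern; the xx_* names are hypothetical.
 * In the driver's chpoll entry point, when no polled events are pending,
 * hand back the device's pollhead so the polldat can be enqueued on it:
 *
 *	if ((*reventsp = events & xx_pending_events(xx)) == 0 && !anyyet)
 *		*phpp = &xx->xx_pollhead;
 *
 * Later, in the interrupt or wakeup path, when the event happens:
 *
 *	pollwakeup(&xx->xx_pollhead, POLLIN | POLLRDNORM);
 */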

/*
 * This function is called to inform a thread that
 * an event being polled for has occurred.
 * The pollstate lock on the thread should be held on entry.
 */
void
pollnotify(pollcache_t *pcp, int fd)
{
	ASSERT(fd < pcp->pc_mapsize);
	ASSERT(MUTEX_HELD(&pcp->pc_lock));
	BT_SET(pcp->pc_bitmap, fd);
	pcp->pc_flag |= T_POLLWAKE;
	cv_signal(&pcp->pc_cv);
}

/*
 * add a polldat entry to pollhead ph_list. The polldat struct is used
 * by pollwakeup to wake sleeping pollers when polled events have happened.
 */
void
pollhead_insert(pollhead_t *php, polldat_t *pdp)
{
	PH_ENTER(php);
	ASSERT(pdp->pd_next == NULL);
#ifdef DEBUG
	{
		/*
		 * the polldat should not be already on the list
		 */
		polldat_t *wp;
		for (wp = php->ph_list; wp; wp = wp->pd_next) {
			ASSERT(wp != pdp);
		}
	}
#endif	/* DEBUG */
	pdp->pd_next = php->ph_list;
	php->ph_list = pdp;
	PH_EXIT(php);
}

/*
 * Delete the polldat entry from ph_list.
 */
void
pollhead_delete(pollhead_t *php, polldat_t *pdp)
{
	polldat_t *wp;
	polldat_t **wpp;

	PH_ENTER(php);
	for (wpp = &php->ph_list; (wp = *wpp) != NULL; wpp = &wp->pd_next) {
		if (wp == pdp) {
			*wpp = pdp->pd_next;
			pdp->pd_next = NULL;
			break;
		}
	}
#ifdef DEBUG
	/* assert that pdp is no longer in the list */
	for (wp = *wpp; wp; wp = wp->pd_next) {
		ASSERT(wp != pdp);
	}
#endif	/* DEBUG */
	PH_EXIT(php);
}

/*
 * walk through the poll fd lists to see if they are identical. This is an
 * expensive operation and should not be done more than once for each poll()
 * call.
 *
 * As an optimization (i.e., not having to go through the lists more than
 * once), this routine also clears the revents field of pollfd in 'current'.
 * Zeroing out the revents field of each entry in the current poll list is
 * required by the poll man page.
 *
 * Since the events field of the cached list has illegal poll events filtered
 * out, the current list applies the same filtering before comparison.
 *
 * The routine stops when it detects a meaningful difference, or when it
 * exhausts the lists.
 */
int
pcacheset_cmp(pollfd_t *current, pollfd_t *cached, pollfd_t *newlist, int n)
{
	int    ix;

	for (ix = 0; ix < n; ix++) {
		/* Prefetch 64 bytes worth of 8-byte elements */
		if ((ix & 0x7) == 0) {
			prefetch_write_many((caddr_t)&current[ix + 8]);
			prefetch_write_many((caddr_t)&cached[ix + 8]);
		}
		if (current[ix].fd == cached[ix].fd) {
			/*
			 * Filter out invalid poll events while we are
			 * inside the loop.
			 */
			if (current[ix].events & ~VALID_POLL_EVENTS) {
				current[ix].events &= VALID_POLL_EVENTS;
				if (newlist != NULL)
					newlist[ix].events = current[ix].events;
			}
			if (current[ix].events == cached[ix].events) {
				current[ix].revents = 0;
				continue;
			}
		}
		if ((current[ix].fd < 0) && (cached[ix].fd < 0)) {
			current[ix].revents = 0;
			continue;
		}
		return (ix);
	}
	return (ix);
}
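
/*
 * Return convention, by example: with current = {5,POLLIN},{6,POLLOUT}
 * and cached = {5,POLLIN},{7,POLLOUT}, the routine returns 1 (the index
 * of the first meaningful difference) after zeroing current[0].revents;
 * two lists that match over all n entries return n.
 */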

/*
 * This routine returns a pointer to a cached poll fd entry, or NULL if it
 * does not find it in the hash table.
 */
polldat_t *
pcache_lookup_fd(pollcache_t *pcp, int fd)
{
	int hashindex;
	polldat_t *pdp;

	hashindex = POLLHASH(pcp->pc_hashsize, fd);
	pdp = pcp->pc_hash[hashindex];
	while (pdp != NULL) {
		if (pdp->pd_fd == fd)
			break;
		pdp = pdp->pd_hashnext;
	}
	return (pdp);
}

polldat_t *
pcache_alloc_fd(int nsets)
{
	polldat_t *pdp;

	pdp = kmem_zalloc(sizeof (polldat_t), KM_SLEEP);
	if (nsets > 0) {
		pdp->pd_ref = kmem_zalloc(sizeof (xref_t) * nsets, KM_SLEEP);
		pdp->pd_nsets = nsets;
	}
	return (pdp);
}

/*
 * This routine inserts a polldat into the pollcache's hash table. It
 * may be necessary to grow the size of the hash table.
 */
void
pcache_insert_fd(pollcache_t *pcp, polldat_t *pdp, nfds_t nfds)
{
	int hashindex;
	int fd;

	if ((pcp->pc_fdcount > pcp->pc_hashsize * POLLHASHTHRESHOLD) ||
	    (nfds > pcp->pc_hashsize * POLLHASHTHRESHOLD)) {
		pcache_grow_hashtbl(pcp, nfds);
	}
	fd = pdp->pd_fd;
	hashindex = POLLHASH(pcp->pc_hashsize, fd);
	pdp->pd_hashnext = pcp->pc_hash[hashindex];
	pcp->pc_hash[hashindex] = pdp;
	pcp->pc_fdcount++;

#ifdef DEBUG
	{
		/*
		 * same fd should not appear on a hash list twice
		 */
		polldat_t *pdp1;
		for (pdp1 = pdp->pd_hashnext; pdp1; pdp1 = pdp1->pd_hashnext) {
			ASSERT(pdp->pd_fd != pdp1->pd_fd);
		}
	}
#endif	/* DEBUG */
}

/*
 * Grow the hash table -- either double the table size or round it to the
 * nearest multiple of POLLHASHCHUNKSZ, whichever is bigger. Rehash all the
 * elements on the hash table.
 */
void
pcache_grow_hashtbl(pollcache_t *pcp, nfds_t nfds)
{
	int	oldsize;
	polldat_t **oldtbl;
	polldat_t *pdp, *pdp1;
	int	i;
#ifdef DEBUG
	int	count = 0;
#endif

	ASSERT(pcp->pc_hashsize % POLLHASHCHUNKSZ == 0);
	oldsize = pcp->pc_hashsize;
	oldtbl = pcp->pc_hash;
	if (nfds > pcp->pc_hashsize * POLLHASHINC) {
		pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) &
		    ~(POLLHASHCHUNKSZ - 1);
	} else {
		pcp->pc_hashsize = pcp->pc_hashsize * POLLHASHINC;
	}
	pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *),
	    KM_SLEEP);
	/*
	 * rehash existing elements
	 */
	pcp->pc_fdcount = 0;
	for (i = 0; i < oldsize; i++) {
		pdp = oldtbl[i];
		while (pdp != NULL) {
			pdp1 = pdp->pd_hashnext;
			pcache_insert_fd(pcp, pdp, nfds);
			pdp = pdp1;
#ifdef DEBUG
			count++;
#endif
		}
	}
	kmem_free(oldtbl, oldsize * sizeof (polldat_t *));
	ASSERT(pcp->pc_fdcount == count);
}

void
pcache_grow_map(pollcache_t *pcp, int fd)
{
	int	newsize;
	ulong_t	*newmap;

	/*
	 * grow to nearest multiple of POLLMAPCHUNK, assuming POLLMAPCHUNK is
	 * power of 2.
	 */
	newsize = (fd + POLLMAPCHUNK) & ~(POLLMAPCHUNK - 1);
	newmap = kmem_zalloc((newsize / BT_NBIPUL) * sizeof (ulong_t),
	    KM_SLEEP);
	/*
	 * don't want pollwakeup to set a bit while growing the bitmap.
	 */
	ASSERT(mutex_owned(&pcp->pc_lock) == 0);
	mutex_enter(&pcp->pc_lock);
	bcopy(pcp->pc_bitmap, newmap,
	    (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t));
	kmem_free(pcp->pc_bitmap,
	    (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t));
	pcp->pc_bitmap = newmap;
	pcp->pc_mapsize = newsize;
	mutex_exit(&pcp->pc_lock);
}
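
/*
 * Worked example of the rounding above, assuming for illustration that
 * POLLMAPCHUNK is 2048: for fd 3000, newsize = (3000 + 2048) & ~2047 =
 * 4096. The bitmap always grows to the next multiple of POLLMAPCHUNK
 * strictly greater than fd, so fd's bit is guaranteed to fit.
 */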

/*
 * remove all the references from the pollhead list and fpollinfo lists.
 */
void
pcache_clean(pollcache_t *pcp)
{
	int i;
	polldat_t **hashtbl;
	polldat_t *pdp;

	ASSERT(MUTEX_HELD(&curthread->t_pollstate->ps_lock));
	hashtbl = pcp->pc_hash;
	for (i = 0; i < pcp->pc_hashsize; i++) {
		for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
			if (pdp->pd_php != NULL) {
				pollhead_delete(pdp->pd_php, pdp);
				pdp->pd_php = NULL;
			}
			if (pdp->pd_fp != NULL) {
				delfpollinfo(pdp->pd_fd);
				pdp->pd_fp = NULL;
			}
		}
	}
}

void
pcacheset_invalidate(pollstate_t *ps, polldat_t *pdp)
{
	int	i;
	int	fd = pdp->pd_fd;

	/*
	 * we come here because of an earlier close() on this cached poll fd.
	 */
	ASSERT(pdp->pd_fp == NULL);
	ASSERT(MUTEX_HELD(&ps->ps_lock));
	pdp->pd_events = 0;
	for (i = 0; i < ps->ps_nsets; i++) {
		xref_t		*refp;
		pollcacheset_t	*pcsp;

		ASSERT(pdp->pd_ref != NULL);
		refp = &pdp->pd_ref[i];
		if (refp->xf_refcnt) {
			ASSERT(refp->xf_position >= 0);
			pcsp = &ps->ps_pcacheset[i];
			if (refp->xf_refcnt == 1) {
				pcsp->pcs_pollfd[refp->xf_position].fd = -1;
				refp->xf_refcnt = 0;
				pdp->pd_count--;
			} else if (refp->xf_refcnt > 1) {
				int	j;

				/*
				 * turn off every appearance in pcs_pollfd list
				 */
				for (j = refp->xf_position;
				    j < pcsp->pcs_nfds; j++) {
					if (pcsp->pcs_pollfd[j].fd == fd) {
						pcsp->pcs_pollfd[j].fd = -1;
						refp->xf_refcnt--;
						pdp->pd_count--;
					}
				}
			}
			ASSERT(refp->xf_refcnt == 0);
			refp->xf_position = POLLPOSINVAL;
		}
	}
	ASSERT(pdp->pd_count == 0);
}

/*
 * Insert poll fd into the pollcache, and add poll registration.
 * This routine is called after getf() and before releasef(). So the vnode
 * can not disappear even if we block here.
 * If there is an error, the polled fd is not cached.
 */
int
pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp,
    ssize_t pos, int which)
{
	pollcache_t	*pcp = ps->ps_pcache;
	polldat_t	*pdp;
	int		error;
	int		fd;
	pollhead_t	*memphp = NULL;
	xref_t		*refp;
	int		newpollfd = 0;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
	/*
	 * The poll caching uses the existing VOP_POLL interface. If there
	 * are no polled events, we want the polled device to set its "some
	 * one is sleeping in poll" flag. When the polled events happen
	 * later, the driver will call pollwakeup(). We achieve this by
	 * always passing 0 in the third parameter ("anyyet") when calling
	 * VOP_POLL. This parameter is not looked at by drivers when the
	 * polled events exist. If a driver chooses to ignore this parameter
	 * and call pollwakeup whenever the polled events happen, that will
	 * be OK too.
	 */
	ASSERT(curthread->t_pollcache == NULL);
	error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents,
	    &memphp, NULL);
	if (error) {
		return (error);
	}
	if (pollfdp->revents) {
		(*fdcntp)++;
	}
	/*
	 * polling the underlying device succeeded. Now we can cache it.
	 * A close can't come in here because we have not done a releasef()
	 * yet.
	 */
	fd = pollfdp->fd;
	pdp = pcache_lookup_fd(pcp, fd);
	if (pdp == NULL) {
		ASSERT(ps->ps_nsets > 0);
		pdp = pcache_alloc_fd(ps->ps_nsets);
		newpollfd = 1;
	}
	/*
	 * If this entry was used to cache a poll fd which was closed, and
	 * this entry has not been cleaned, do it now.
	 */
	if ((pdp->pd_count > 0) && (pdp->pd_fp == NULL)) {
		pcacheset_invalidate(ps, pdp);
		ASSERT(pdp->pd_next == NULL);
	}
	if (pdp->pd_count == 0) {
		pdp->pd_fd = fd;
		pdp->pd_fp = fp;
		addfpollinfo(fd);
		pdp->pd_thread = curthread;
		pdp->pd_pcache = pcp;
		/*
		 * the entry is never used or cleared by removing a cached
		 * pollfd (pcache_delete_fd). So all the fields should be clear.
		 */
		ASSERT(pdp->pd_next == NULL);
	}

	/*
	 * A polled fd is considered cached. So there should be a fpollinfo
	 * entry on uf_fpollinfo list.
	 */
	ASSERT(infpollinfo(fd));
	/*
	 * If there is an inconsistency, we want to know it here.
	 */
	ASSERT(pdp->pd_fp == fp);

	/*
	 * XXX pd_events is a union of all polled events on this fd, possibly
	 * by different threads. Unless this is a new first poll(), pd_events
	 * never shrinks. If an event is no longer polled by a process, there
	 * is no way to cancel that event. In that case, poll degrades to its
	 * old form -- polling on this fd every time poll() is called. The
	 * assumption is an app always polls the same type of events.
	 */
	pdp->pd_events |= pollfdp->events;

	pdp->pd_count++;
	/*
	 * There is not much special handling for multiple appearances of
	 * same fd other than xf_position always recording the first
	 * appearance in poll list. If this is called from pcacheset_cache_list,
	 * a VOP_POLL is called on every pollfd entry; therefore each
	 * revents and fdcnt should be set correctly. If this is called from
	 * pcacheset_resolve, we don't care about fdcnt here. Pollreadmap will
	 * pick up the right count and handle revents field of each pollfd
	 * entry.
	 */
	ASSERT(pdp->pd_ref != NULL);
	refp = &pdp->pd_ref[which];
	if (refp->xf_refcnt == 0) {
		refp->xf_position = pos;
	} else {
		/*
		 * xf_position records the fd's first appearance in poll list
		 */
		if (pos < refp->xf_position) {
			refp->xf_position = pos;
		}
	}
	ASSERT(pollfdp->fd == ps->ps_pollfd[refp->xf_position].fd);
	refp->xf_refcnt++;
	if (fd >= pcp->pc_mapsize) {
		pcache_grow_map(pcp, fd);
	}
	if (fd > pcp->pc_mapend) {
		pcp->pc_mapend = fd;
	}
	if (newpollfd != 0) {
		pcache_insert_fd(ps->ps_pcache, pdp, ps->ps_nfds);
	}
	if (memphp) {
		if (pdp->pd_php == NULL) {
			pollhead_insert(memphp, pdp);
			pdp->pd_php = memphp;
		} else {
			if (memphp != pdp->pd_php) {
				/*
				 * layered devices (e.g. console driver)
				 * may change the vnode and thus the pollhead
				 * pointer out from underneath us.
				 */
				pollhead_delete(pdp->pd_php, pdp);
				pollhead_insert(memphp, pdp);
				pdp->pd_php = memphp;
			}
		}
	}
	/*
	 * Since there is a considerable window between VOP_POLL and when
	 * we actually put the polldat struct on the pollhead list, we could
	 * miss a pollwakeup. In the case of polling additional events, we
	 * don't update the events until after VOP_POLL. So we could miss
	 * pollwakeup there too. So we always set the bit here just to be
	 * safe. The real performance gain is in subsequent pcache_poll.
	 */
	mutex_enter(&pcp->pc_lock);
	BT_SET(pcp->pc_bitmap, fd);
	mutex_exit(&pcp->pc_lock);
	return (0);
}

/*
 * The entry is not really deleted. The fields are cleared so that the
 * entry is no longer useful, but it will remain in the hash table for reuse
 * later. It will be freed when the polling lwp exits.
 */
int
pcache_delete_fd(pollstate_t *ps, int fd, size_t pos, int which, uint_t cevent)
{
	pollcache_t	*pcp = ps->ps_pcache;
	polldat_t	*pdp;
	xref_t		*refp;

	ASSERT(fd < pcp->pc_mapsize);
	ASSERT(MUTEX_HELD(&ps->ps_lock));

	pdp = pcache_lookup_fd(pcp, fd);
	ASSERT(pdp != NULL);
	ASSERT(pdp->pd_count > 0);
	ASSERT(pdp->pd_ref != NULL);
	refp = &pdp->pd_ref[which];
	if (pdp->pd_count == 1) {
		pdp->pd_events = 0;
		refp->xf_position = POLLPOSINVAL;
		ASSERT(refp->xf_refcnt == 1);
		refp->xf_refcnt = 0;
		if (pdp->pd_php) {
			/*
			 * It is possible for a wakeup thread to get ahead
			 * of the following pollhead_delete and set the bit in
			 * bitmap. It is OK because the bit will be cleared
			 * here anyway.
			 */
			pollhead_delete(pdp->pd_php, pdp);
			pdp->pd_php = NULL;
		}
		pdp->pd_count = 0;
		if (pdp->pd_fp != NULL) {
			pdp->pd_fp = NULL;
			delfpollinfo(fd);
		}
		mutex_enter(&pcp->pc_lock);
		BT_CLEAR(pcp->pc_bitmap, fd);
		mutex_exit(&pcp->pc_lock);
		return (0);
	}
	if ((cevent & POLLCLOSED) == POLLCLOSED) {
		/*
		 * fd cached here has been closed. This is the first
		 * pcache_delete_fd called after the close. Clean up the
		 * entire entry.
		 */
		pcacheset_invalidate(ps, pdp);
		ASSERT(pdp->pd_php == NULL);
		mutex_enter(&pcp->pc_lock);
		BT_CLEAR(pcp->pc_bitmap, fd);
		mutex_exit(&pcp->pc_lock);
		return (0);
	}
#ifdef DEBUG
	if (getf(fd) != NULL) {
		ASSERT(infpollinfo(fd));
		releasef(fd);
	}
#endif	/* DEBUG */
	pdp->pd_count--;
	ASSERT(refp->xf_refcnt > 0);
	if (--refp->xf_refcnt == 0) {
		refp->xf_position = POLLPOSINVAL;
	} else {
		ASSERT(pos >= refp->xf_position);
		if (pos == refp->xf_position) {
			/*
			 * The xref position is no longer valid.
			 * Reset it to a special value and let
			 * caller know it needs to updatexref()
			 * with a new xf_position value.
			 */
			refp->xf_position = POLLPOSTRANS;
			return (1);
		}
	}
	return (0);
}

void
pcache_update_xref(pollcache_t *pcp, int fd, ssize_t pos, int which)
{
	polldat_t	*pdp;

	pdp = pcache_lookup_fd(pcp, fd);
	ASSERT(pdp != NULL);
	ASSERT(pdp->pd_ref != NULL);
	pdp->pd_ref[which].xf_position = pos;
}

#ifdef DEBUG
/*
 * For each polled fd, it's either in the bitmap or cached in
 * pcache hash table. If this routine returns 0, something is wrong.
 */
static int
pollchecksanity(pollstate_t *ps, nfds_t nfds)
{
	int		i;
	int		fd;
	pollcache_t	*pcp = ps->ps_pcache;
	polldat_t	*pdp;
	pollfd_t	*pollfdp = ps->ps_pollfd;
	file_t		*fp;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
	for (i = 0; i < nfds; i++) {
		fd = pollfdp[i].fd;
		if (fd < 0) {
			ASSERT(pollfdp[i].revents == 0);
			continue;
		}
		if (pollfdp[i].revents == POLLNVAL)
			continue;
		if ((fp = getf(fd)) == NULL)
			continue;
		pdp = pcache_lookup_fd(pcp, fd);
		ASSERT(pdp != NULL);
		ASSERT(infpollinfo(fd));
		ASSERT(pdp->pd_fp == fp);
		releasef(fd);
		if (BT_TEST(pcp->pc_bitmap, fd))
			continue;
		if (pdp->pd_php == NULL)
			return (0);
	}
	return (1);
}
#endif	/* DEBUG */

/*
 * resolve the difference between the current poll list and a cached one.
 */
int
pcacheset_resolve(pollstate_t *ps, nfds_t nfds, int *fdcntp, int which)
{
	int		i;
	pollcache_t	*pcp = ps->ps_pcache;
	pollfd_t	*newlist = NULL;
	pollfd_t	*current = ps->ps_pollfd;
	pollfd_t	*cached;
	pollcacheset_t	*pcsp;
	int		common;
	int		count = 0;
	int		offset;
	int		remain;
	int		fd;
	file_t		*fp;
	int		fdcnt = 0;
	int		cnt = 0;
	nfds_t		old_nfds;
	int		error = 0;
	int		mismatch = 0;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
#ifdef DEBUG
	checkpolldat(ps);
#endif
	pcsp = &ps->ps_pcacheset[which];
	old_nfds = pcsp->pcs_nfds;
	common = (nfds > old_nfds) ? old_nfds : nfds;
	if (nfds != old_nfds) {
		/*
		 * the length of poll list has changed. allocate a new
		 * pollfd list.
		 */
		newlist = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
		bcopy(current, newlist, sizeof (pollfd_t) * nfds);
	}
	/*
	 * Compare the overlapping part of the current fd list with the
	 * cached one. Whenever a difference is found, resolve it.
	 * The comparison is done on the current poll list and the
	 * cached list. But we may be setting up the newlist to be the
	 * cached list for next poll.
	 */
	cached = pcsp->pcs_pollfd;
	remain = common;

	while (count < common) {
		int	tmpfd;
		pollfd_t *np;

		np = (newlist != NULL) ? &newlist[count] : NULL;
		offset = pcacheset_cmp(&current[count], &cached[count], np,
		    remain);
		/*
		 * Collect stats. If the lists match completely on the first
		 * pass, it's a hit. Otherwise, it's a partial hit or a miss.
		 */
		if ((count == 0) && (offset == common)) {
			pollstats.pollcachehit.value.ui64++;
		} else {
			mismatch++;
		}
		count += offset;
		if (offset < remain) {
			ASSERT(count < common);
			ASSERT((current[count].fd != cached[count].fd) ||
			    (current[count].events != cached[count].events));
			/*
			 * Filter out invalid events.
			 */
			if (current[count].events & ~VALID_POLL_EVENTS) {
				if (newlist != NULL) {
					newlist[count].events =
					    current[count].events &=
					    VALID_POLL_EVENTS;
				} else {
					current[count].events &=
					    VALID_POLL_EVENTS;
				}
			}
			/*
			 * when resolving a difference, we always remove the
			 * fd from cache before inserting one into cache.
			 */
			if (cached[count].fd >= 0) {
				tmpfd = cached[count].fd;
				if (pcache_delete_fd(ps, tmpfd, count, which,
				    (uint_t)cached[count].events)) {
					/*
					 * This should be rare but needed for
					 * correctness.
					 *
					 * The first appearance in cached list
					 * is being "turned off". The same fd
					 * appears more than once in the cached
					 * poll list. Find the next one on the
					 * list and update the cached
					 * xf_position field.
					 */
					for (i = count + 1; i < old_nfds; i++) {
						if (cached[i].fd == tmpfd) {
							pcache_update_xref(pcp,
							    tmpfd, (ssize_t)i,
							    which);
							break;
						}
					}
					ASSERT(i <= old_nfds);
				}
				/*
				 * In case a new cache list is allocated,
				 * need to keep both cache lists in sync
				 * b/c the new one can be freed if we have
				 * an error later.
				 */
				cached[count].fd = -1;
				if (newlist != NULL) {
					newlist[count].fd = -1;
				}
			}
			if ((tmpfd = current[count].fd) >= 0) {
				/*
				 * add to the cached fd tbl and bitmap.
				 */
				if ((fp = getf(tmpfd)) == NULL) {
					current[count].revents = POLLNVAL;
					if (newlist != NULL) {
						newlist[count].fd = -1;
					}
					cached[count].fd = -1;
					fdcnt++;
				} else {
					/*
					 * Here we don't care about the
					 * fdcnt. We will examine the bitmap
					 * later and pick up the correct
					 * fdcnt there. So we never bother
					 * to check value of 'cnt'.
					 */
					error = pcache_insert(ps, fp,
					    &current[count], &cnt,
					    (ssize_t)count, which);
					/*
					 * if no error, we want to do releasef
					 * after we updated cache poll list
					 * entry so that close() won't race
					 * us.
					 */
					if (error) {
						/*
						 * If we encountered an error,
						 * we have invalidated an
						 * entry in cached poll list
						 * (in pcache_delete_fd() above)
						 * but failed to add one here.
						 * This is OK b/c what's in the
						 * cached list is consistent
						 * with content of cache.
						 * It will not have any ill
						 * effect on next poll().
						 */
						releasef(tmpfd);
						if (newlist != NULL) {
							kmem_free(newlist,
							    nfds *
							    sizeof (pollfd_t));
						}
						return (error);
					}
					/*
					 * If we have allocated a new(temp)
					 * cache list, we need to keep both
					 * in sync b/c the new one can be freed
					 * if we have an error later.
					 */
					if (newlist != NULL) {
						newlist[count].fd =
						    current[count].fd;
						newlist[count].events =
						    current[count].events;
					}
					cached[count].fd = current[count].fd;
					cached[count].events =
					    current[count].events;
					releasef(tmpfd);
				}
			} else {
				current[count].revents = 0;
			}
			count++;
			remain = common - count;
		}
	}
	if (mismatch != 0) {
		if (mismatch == common) {
			pollstats.pollcachemiss.value.ui64++;
		} else {
			pollstats.pollcachephit.value.ui64++;
		}
	}
	/*
	 * take care of the non overlapping part of a list
	 */
	if (nfds > old_nfds) {
		ASSERT(newlist != NULL);
		for (i = old_nfds; i < nfds; i++) {
			/* filter out invalid events */
			if (current[i].events & ~VALID_POLL_EVENTS) {
				newlist[i].events = current[i].events =
				    current[i].events & VALID_POLL_EVENTS;
			}
			if ((fd = current[i].fd) < 0) {
				current[i].revents = 0;
				continue;
			}
			/*
			 * add to the cached fd tbl and bitmap.
			 */
			if ((fp = getf(fd)) == NULL) {
				current[i].revents = POLLNVAL;
				newlist[i].fd = -1;
				fdcnt++;
				continue;
			}
			/*
			 * Here we don't care about the
			 * fdcnt. We will examine the bitmap
			 * later and pick up the correct
			 * fdcnt there. So we never bother to
			 * check 'cnt'.
			 */
			error = pcache_insert(ps, fp, &current[i], &cnt,
			    (ssize_t)i, which);
			releasef(fd);
			if (error) {
				/*
				 * Here we are half way through adding newly
				 * polled fd. Undo enough to keep the cache
				 * list consistent with the cache content.
				 */
				pcacheset_remove_list(ps, current, old_nfds,
				    i, which, 0);
				kmem_free(newlist, nfds * sizeof (pollfd_t));
				return (error);
			}
		}
	}
	if (old_nfds > nfds) {
		/*
		 * remove the fd's which are no longer polled.
		 */
		pcacheset_remove_list(ps, pcsp->pcs_pollfd, nfds, old_nfds,
		    which, 1);
	}
	/*
	 * set difference resolved. update nfds and cachedlist
	 * in pollstate struct.
	 */
	if (newlist != NULL) {
		kmem_free(pcsp->pcs_pollfd, old_nfds * sizeof (pollfd_t));
		/*
		 * By now, the pollfd.revents field should
		 * all be zeroed.
		 */
		pcsp->pcs_pollfd = newlist;
		pcsp->pcs_nfds = nfds;
	}
	ASSERT(*fdcntp == 0);
	*fdcntp = fdcnt;
	/*
	 * By now for every fd in pollfdp, one of the following should be
	 * true. Otherwise we will miss a polled event.
	 *
	 * 1. the bit corresponding to the fd in bitmap is set. So VOP_POLL
	 *    will be called on this fd in next poll.
	 * 2. the fd is cached in the pcache (i.e. pd_php is set). So
	 *    pollnotify will happen.
	 */
	ASSERT(pollchecksanity(ps, nfds));
	/*
	 * make sure cross reference between cached poll lists and cached
	 * poll fds are correct.
	 */
	ASSERT(pollcheckxref(ps, which));
	/*
	 * ensure each polldat in pollcache references a polled fd in
	 * pollcacheset.
	 */
#ifdef DEBUG
	checkpolldat(ps);
#endif
	return (0);
}

#ifdef DEBUG
static int
pollscanrevents(pollcache_t *pcp, pollfd_t *pollfdp, nfds_t nfds)
{
	int i;
	int reventcnt = 0;

	for (i = 0; i < nfds; i++) {
		if (pollfdp[i].fd < 0) {
			ASSERT(pollfdp[i].revents == 0);
			continue;
		}
		if (pollfdp[i].revents) {
			reventcnt++;
		}
		if (pollfdp[i].revents && (pollfdp[i].revents != POLLNVAL)) {
			ASSERT(BT_TEST(pcp->pc_bitmap, pollfdp[i].fd));
		}
	}
	return (reventcnt);
}
#endif	/* DEBUG */

/*
 * read the bitmap and poll on fds corresponding to the '1' bits. The ps_lock
 * is held upon entry.
 */
int
pcache_poll(pollfd_t *pollfdp, pollstate_t *ps, nfds_t nfds, int *fdcntp,
    int which)
{
	int		i;
	pollcache_t	*pcp;
	int		fd;
	int		begin, end, done;
	pollhead_t	*php;
	int		fdcnt;
	int		error = 0;
	file_t		*fp;
	polldat_t	*pdp;
	xref_t		*refp;
	int		entry;

	pcp = ps->ps_pcache;
	ASSERT(MUTEX_HELD(&ps->ps_lock));
	ASSERT(MUTEX_HELD(&pcp->pc_lock));
retry:
	done = 0;
	begin = 0;
	fdcnt = 0;
	end = pcp->pc_mapend;
	while ((fdcnt < nfds) && !done) {
		php = NULL;
		/*
		 * only poll fds which may have events
		 */
		fd = bt_getlowbit(pcp->pc_bitmap, begin, end);
		ASSERT(fd <= end);
		if (fd >= 0) {
			ASSERT(pollcheckrevents(ps, begin, fd, which));
			/*
			 * adjust map pointers for next round
			 */
			if (fd == end) {
				done = 1;
			} else {
				begin = fd + 1;
			}
			/*
			 * A bitmap caches poll state information of
			 * multiple poll lists. Call VOP_POLL only if
			 * the bit corresponds to an fd in this poll
			 * list.
			 */
			pdp = pcache_lookup_fd(pcp, fd);
			ASSERT(pdp != NULL);
			ASSERT(pdp->pd_ref != NULL);
			refp = &pdp->pd_ref[which];
			if (refp->xf_refcnt == 0)
				continue;
			entry = refp->xf_position;
			ASSERT((entry >= 0) && (entry < nfds));
			ASSERT(pollfdp[entry].fd == fd);
			/*
			 * Being in this routine implies that we have
			 * successfully polled this fd in the past.
			 * Check to see if this fd was closed while we were
			 * blocked in poll. This ensures that we don't
			 * miss a close on the fd in the case this fd is
			 * reused.
			 */
			if (pdp->pd_fp == NULL) {
				ASSERT(pdp->pd_count > 0);
				pollfdp[entry].revents = POLLNVAL;
				fdcnt++;
				if (refp->xf_refcnt > 1) {
					/*
					 * this fd appeared multiple times
					 * in the poll list. Find all of them.
					 */
					for (i = entry + 1; i < nfds; i++) {
						if (pollfdp[i].fd == fd) {
							pollfdp[i].revents =
							    POLLNVAL;
							fdcnt++;
						}
					}
				}
				pcacheset_invalidate(ps, pdp);
				continue;
			}
			/*
			 * We can be here polling a device that is being
			 * closed (i.e. the file pointer is set to NULL,
			 * but pollcacheclean has not happened yet).
			 */
			if ((fp = getf(fd)) == NULL) {
				pollfdp[entry].revents = POLLNVAL;
				fdcnt++;
				if (refp->xf_refcnt > 1) {
					/*
					 * this fd appeared multiple times
					 * in the poll list. Find all of them.
					 */
					for (i = entry + 1; i < nfds; i++) {
						if (pollfdp[i].fd == fd) {
							pollfdp[i].revents =
							    POLLNVAL;
							fdcnt++;
						}
					}
				}
				continue;
			}
			ASSERT(pdp->pd_fp == fp);
			ASSERT(infpollinfo(fd));
			/*
			 * Since we no longer hold poll head lock across
			 * VOP_POLL, pollunlock logic can be simplified.
			 */
			ASSERT(pdp->pd_php == NULL ||
			    MUTEX_NOT_HELD(PHLOCK(pdp->pd_php)));
			/*
			 * underlying file systems may set a "pollpending"
			 * flag when they see the poll may block. Pollwakeup()
			 * is called by wakeup thread if pollpending is set.
			 * Pass a 0 fdcnt so that the underlying file system
			 * will set the "pollpending" flag when there are
			 * no polled events.
			 *
			 * Use pollfdp[].events for actual polling because
			 * the pd_events is union of all cached poll events
			 * on this fd. The events parameter also affects
			 * how the polled device sets the "poll pending"
			 * flag.
			 */
			ASSERT(curthread->t_pollcache == NULL);
			error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0,
			    &pollfdp[entry].revents, &php, NULL);
			/*
			 * releasef after completely done with this cached
			 * poll entry. To prevent close() coming in to clear
			 * this entry.
			 */
			if (error) {
				releasef(fd);
				break;
			}
			/*
			 * layered devices (e.g. console driver)
			 * may change the vnode and thus the pollhead
			 * pointer out from underneath us.
			 */
			if (php != NULL && pdp->pd_php != NULL &&
			    php != pdp->pd_php) {
				releasef(fd);
				pollhead_delete(pdp->pd_php, pdp);
				pdp->pd_php = php;
				pollhead_insert(php, pdp);
				/*
				 * We could have missed a wakeup on the new
				 * target device. Make sure the new target
				 * gets polled once.
				 */
				BT_SET(pcp->pc_bitmap, fd);
				goto retry;
			}

			if (pollfdp[entry].revents) {
				ASSERT(refp->xf_refcnt >= 1);
				fdcnt++;
				if (refp->xf_refcnt > 1) {
					/*
					 * this fd appeared multiple times
					 * in the poll list. This is rare but
					 * we have to look at all of them for
					 * correctness.
					 */
					error = plist_chkdupfd(fp, pdp, ps,
					    pollfdp, entry, &fdcnt);
					if (error > 0) {
						releasef(fd);
						break;
					}
					if (error < 0) {
						goto retry;
					}
				}
				releasef(fd);
			} else {
				/*
				 * VOP_POLL didn't return any revents. We can
				 * clear the bit in bitmap only if we have the
				 * pollhead ptr cached and no other cached
				 * entry is polling different events on this fd.
				 * VOP_POLL may have dropped the ps_lock. Make
				 * sure pollwakeup has not happened before
				 * clearing the bit.
				 */
				if ((pdp->pd_php != NULL) &&
				    (pollfdp[entry].events == pdp->pd_events) &&
				    ((pcp->pc_flag & T_POLLWAKE) == 0)) {
					BT_CLEAR(pcp->pc_bitmap, fd);
				}
				/*
				 * if the fd can be cached now but not before,
				 * do it now.
				 */
				if ((pdp->pd_php == NULL) && (php != NULL)) {
					pdp->pd_php = php;
					pollhead_insert(php, pdp);
					/*
					 * We are inserting a polldat struct for
					 * the first time. We may have missed a
					 * wakeup on this device. Re-poll once.
					 * This should be a rare event.
					 */
					releasef(fd);
					goto retry;
				}
				if (refp->xf_refcnt > 1) {
					/*
					 * this fd appeared multiple times
					 * in the poll list. This is rare but
					 * we have to look at all of them for
					 * correctness.
					 */
					error = plist_chkdupfd(fp, pdp, ps,
					    pollfdp, entry, &fdcnt);
					if (error > 0) {
						releasef(fd);
						break;
					}
					if (error < 0) {
						goto retry;
					}
				}
				releasef(fd);
			}
		} else {
			done = 1;
			ASSERT(pollcheckrevents(ps, begin, end + 1, which));
		}
	}
	if (!error) {
		ASSERT(*fdcntp + fdcnt == pollscanrevents(pcp, pollfdp, nfds));
		*fdcntp += fdcnt;
	}
	return (error);
}

/*
 * Going through the poll list without much locking. Poll all fds and
 * cache all valid fds in the pollcache.
 */
int
pcacheset_cache_list(pollstate_t *ps, pollfd_t *fds, int *fdcntp, int which)
{
	pollfd_t	*pollfdp = ps->ps_pollfd;
	pollcacheset_t	*pcacheset = ps->ps_pcacheset;
	pollfd_t	*newfdlist;
	int		i;
	int		fd;
	file_t		*fp;
	int		error = 0;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
	ASSERT(which < ps->ps_nsets);
	ASSERT(pcacheset != NULL);
	ASSERT(pcacheset[which].pcs_pollfd == NULL);
	newfdlist = kmem_alloc(ps->ps_nfds * sizeof (pollfd_t), KM_SLEEP);

	/*
	 * cache the new poll list in pollcacheset.
	 */
	bcopy(pollfdp, newfdlist, sizeof (pollfd_t) * ps->ps_nfds);

	pcacheset[which].pcs_pollfd = newfdlist;
	pcacheset[which].pcs_nfds = ps->ps_nfds;
	pcacheset[which].pcs_usradr = (uintptr_t)fds;

	/*
	 * We have saved a copy of current poll fd list in one pollcacheset.
	 * The 'revents' field of the new list is not yet set to 0. Looping
	 * through the new list just to do that is expensive. We do that
	 * while polling the list.
	 */
	for (i = 0; i < ps->ps_nfds; i++) {
		fd = pollfdp[i].fd;
		/*
		 * We also filter out the illegal poll events in the event
		 * field for the cached poll list/set.
		 */
		if (pollfdp[i].events & ~VALID_POLL_EVENTS) {
			newfdlist[i].events = pollfdp[i].events =
			    pollfdp[i].events & VALID_POLL_EVENTS;
		}
		if (fd < 0) {
			pollfdp[i].revents = 0;
			continue;
		}
		if ((fp = getf(fd)) == NULL) {
			pollfdp[i].revents = POLLNVAL;
			/*
			 * invalidate this cache entry in the cached poll list
			 */
			newfdlist[i].fd = -1;
			(*fdcntp)++;
			continue;
		}
		/*
		 * cache this fd.
		 */
		error = pcache_insert(ps, fp, &pollfdp[i], fdcntp, (ssize_t)i,
		    which);
		releasef(fd);
		if (error) {
			/*
			 * Here we are half way through caching a new
			 * poll list. Undo everything.
			 */
			pcacheset_remove_list(ps, pollfdp, 0, i, which, 0);
			kmem_free(newfdlist, ps->ps_nfds * sizeof (pollfd_t));
			pcacheset[which].pcs_pollfd = NULL;
			pcacheset[which].pcs_usradr = NULL;
			break;
		}
	}
	return (error);
}

/*
 * Called by pollcacheclean() to set the fp NULL. It also sets the polled
 * events in the pcacheset entries to a special event, POLLCLOSED. Do a
 * pollwakeup to wake any sleeping poller, then remove the polldat from the
 * driver. The routine is called with ps_lock held.
 */
void
pcache_clean_entry(pollstate_t *ps, int fd)
{
	pollcache_t	*pcp;
	polldat_t	*pdp;
	int		i;

	ASSERT(ps != NULL);
	ASSERT(MUTEX_HELD(&ps->ps_lock));
	pcp = ps->ps_pcache;
	ASSERT(pcp);
	pdp = pcache_lookup_fd(pcp, fd);
	ASSERT(pdp != NULL);
	/*
	 * The corresponding fpollinfo in fi_list has been removed by
	 * a close on this fd. Reset the cached fp pointer here.
	 */
	pdp->pd_fp = NULL;
	/*
	 * XXX - This routine also touches data in the pcacheset struct.
	 *
	 * Set the event in the cached poll lists to POLLCLOSED. This
	 * invalidates the cached poll fd entry in that poll list, which will
	 * force a removal of this cached entry in the next poll(). The
	 * cleanup is done at removal time.
	 */
	ASSERT(pdp->pd_ref != NULL);
	for (i = 0; i < ps->ps_nsets; i++) {
		xref_t		*refp;
		pollcacheset_t	*pcsp;

		refp = &pdp->pd_ref[i];
		if (refp->xf_refcnt) {
			ASSERT(refp->xf_position >= 0);
			pcsp = &ps->ps_pcacheset[i];
			if (refp->xf_refcnt == 1) {
				pcsp->pcs_pollfd[refp->xf_position].events =
				    (short)POLLCLOSED;
			}
			if (refp->xf_refcnt > 1) {
				int	j;
				/*
				 * mark every matching entry in pcs_pollfd
				 */
				for (j = refp->xf_position;
				    j < pcsp->pcs_nfds; j++) {
					if (pcsp->pcs_pollfd[j].fd == fd) {
						pcsp->pcs_pollfd[j].events =
						    (short)POLLCLOSED;
					}
				}
			}
		}
	}
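	/*
	 * Wake up any poller blocked on this fd (the fd is being closed)
	 * and take the polldat off the driver's pollhead list.
	 */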
	if (pdp->pd_php) {
		pollwakeup(pdp->pd_php, POLLHUP);
		pollhead_delete(pdp->pd_php, pdp);
		pdp->pd_php = NULL;
	}
}

/*
 * This is the first time this thread has ever polled, so we have to
 * create its pollstate structure. This will persist for the life of
 * the thread, until it calls pollcleanup().
 */
pollstate_t *
pollstate_create(void)
{
	pollstate_t *ps;

	ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP);
	ps->ps_nsets = POLLFDSETS;
	ps->ps_pcacheset = pcacheset_create(ps->ps_nsets);
	return (ps);
}
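
/*
 * Free a pollstate and everything it caches: the current pollfd list,
 * the pollcache, all cached poll lists, and the /dev/poll buffer.
 */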
void
pollstate_destroy(pollstate_t *ps)
{
	if (ps->ps_pollfd != NULL) {
		kmem_free(ps->ps_pollfd, ps->ps_nfds * sizeof (pollfd_t));
		ps->ps_pollfd = NULL;
	}
	if (ps->ps_pcache != NULL) {
		pcache_destroy(ps->ps_pcache);
		ps->ps_pcache = NULL;
	}
	pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets);
	ps->ps_pcacheset = NULL;
	if (ps->ps_dpbuf != NULL) {
		kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize * sizeof (pollfd_t));
		ps->ps_dpbuf = NULL;
	}
	mutex_destroy(&ps->ps_lock);
	kmem_free(ps, sizeof (pollstate_t));
}

/*
 * We are holding the appropriate uf_lock entering this routine.
 * Bump up the pc_busy count to prevent the thread from exiting.
 */
void
pollblockexit(fpollinfo_t *fpip)
{
	for (; fpip; fpip = fpip->fp_next) {
		pollcache_t *pcp = fpip->fp_thread->t_pollstate->ps_pcache;

		mutex_enter(&pcp->pc_no_exit);
		pcp->pc_busy++;	/* prevents exit()'s */
		mutex_exit(&pcp->pc_no_exit);
	}
}

/*
 * Complete phase 2 of cached poll fd cleanup. Call pcache_clean_entry to mark
 * the pcacheset events field POLLCLOSED to force the next poll() to remove
 * this cache entry. We can't clean up the polldat entry here because the lwp
 * blocked in poll() needs the info to return. Wake up anyone blocked in
 * poll() and let the exiting lwp go. No lock is held on entry, so it is OK
 * for pcache_clean_entry to call pollwakeup().
 */
void
pollcacheclean(fpollinfo_t *fip, int fd)
{
	struct fpollinfo	*fpip, *fpip2;

	fpip = fip;
	while (fpip) {
		pollstate_t *ps = fpip->fp_thread->t_pollstate;
		pollcache_t *pcp = ps->ps_pcache;

		mutex_enter(&ps->ps_lock);
		pcache_clean_entry(ps, fd);
		mutex_exit(&ps->ps_lock);
		mutex_enter(&pcp->pc_no_exit);
		pcp->pc_busy--;
		if (pcp->pc_busy == 0) {
			/*
			 * Wakeup the thread waiting in
			 * thread_exit().
			 */
			cv_signal(&pcp->pc_busy_cv);
		}
		mutex_exit(&pcp->pc_no_exit);

		fpip2 = fpip;
		fpip = fpip->fp_next;
		kmem_free(fpip2, sizeof (fpollinfo_t));
	}
}

/*
 * One of the cache line counters is wrapping around. Reset all cache line
 * counters to zero except one. This is simplistic, but probably works
 * effectively.
 */
void
pcacheset_reset_count(pollstate_t *ps, int index)
{
	int	i;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
	for (i = 0; i < ps->ps_nsets; i++) {
		if (ps->ps_pcacheset[i].pcs_pollfd != NULL) {
			ps->ps_pcacheset[i].pcs_count = 0;
		}
	}
	ps->ps_pcacheset[index].pcs_count = 1;
}

/*
 * This routine implements the poll cache list replacement policy.
 * It currently chooses the "least used" list.
 */
int
pcacheset_replace(pollstate_t *ps)
{
	int	i;
	int	index = 0;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
	for (i = 1; i < ps->ps_nsets; i++) {
		if (ps->ps_pcacheset[index].pcs_count >
		    ps->ps_pcacheset[i].pcs_count) {
			index = i;
		}
	}
	ps->ps_pcacheset[index].pcs_count = 0;
	return (index);
}

/*
 * This routine is called by strclose to remove the remaining polldat structs
 * on the pollhead list of the device being closed. There are two reasons
 * why the polldat structures may still remain on the pollhead list:
 *
 * (1) The layered device (e.g. the console driver).
 * In this case, the existence of a polldat implies that the thread putting
 * the polldat on this list has not exited yet. Before the thread exits, it
 * will have to hold this pollhead lock to remove the polldat. So holding the
 * pollhead lock here effectively prevents the thread which put the polldat
 * on this list from exiting.
 *
 * (2) /dev/poll.
 * When a polled fd is cached in /dev/poll, its polldat will remain on the
 * pollhead list if the process has not done a POLLREMOVE before closing the
 * polled fd. We just unlink it here.
 */
void
pollhead_clean(pollhead_t *php)
{
	polldat_t	*pdp;

	/*
	 * In case (1), while we must prevent the thread in question from
	 * exiting, we must also obey the proper locking order, i.e.
	 * (ps_lock -> phlock).
	 */
	PH_ENTER(php);
	while (php->ph_list != NULL) {
		pollstate_t	*ps;
		pollcache_t	*pcp;

		pdp = php->ph_list;
		ASSERT(pdp->pd_php == php);
		if (pdp->pd_thread == NULL) {
			/*
			 * This is case (2). Since the ph_lock is sufficient
			 * to synchronize this lwp with any other /dev/poll
			 * lwp, just unlink the polldat.
			 */
			php->ph_list = pdp->pd_next;
			pdp->pd_php = NULL;
			pdp->pd_next = NULL;
			continue;
		}
		ps = pdp->pd_thread->t_pollstate;
		ASSERT(ps != NULL);
		pcp = pdp->pd_pcache;
		ASSERT(pcp != NULL);
		mutex_enter(&pcp->pc_no_exit);
		pcp->pc_busy++;	/* prevents exit()'s */
		mutex_exit(&pcp->pc_no_exit);
		/*
		 * Now get the locks in proper order to avoid deadlock.
		 */
		PH_EXIT(php);
		mutex_enter(&ps->ps_lock);
		/*
		 * While we dropped the pollhead lock, the element could have
		 * been taken off the list already.
		 */
		PH_ENTER(php);
		if (pdp->pd_php == php) {
			ASSERT(pdp == php->ph_list);
			php->ph_list = pdp->pd_next;
			pdp->pd_php = NULL;
			pdp->pd_next = NULL;
		}
		PH_EXIT(php);
		mutex_exit(&ps->ps_lock);
		mutex_enter(&pcp->pc_no_exit);
		pcp->pc_busy--;
		if (pcp->pc_busy == 0) {
			/*
			 * Wakeup the thread waiting in
			 * thread_exit().
			 */
			cv_signal(&pcp->pc_busy_cv);
		}
		mutex_exit(&pcp->pc_no_exit);
		PH_ENTER(php);
	}
	PH_EXIT(php);
}

/*
 * pcacheset_remove_list is called to clean up a partially cached 'current'
 * list, or to remove a partial list which is no longer cached. A flag value
 * of 1 indicates the second case.
 */
void
pcacheset_remove_list(pollstate_t *ps, pollfd_t *pollfdp, int start, int end,
    int cacheindex, int flag)
{
	int i;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
	for (i = start; i < end; i++) {
		if ((pollfdp[i].fd >= 0) &&
		    (flag || !(pollfdp[i].revents & POLLNVAL))) {
			if (pcache_delete_fd(ps, pollfdp[i].fd, i, cacheindex,
			    (uint_t)pollfdp[i].events)) {
				int j;
				int fd = pollfdp[i].fd;

				for (j = i + 1; j < end; j++) {
					if (pollfdp[j].fd == fd) {
						pcache_update_xref(
						    ps->ps_pcache, fd,
						    (ssize_t)j, cacheindex);
						break;
					}
				}
				ASSERT(j <= end);
			}
		}
	}
}

#ifdef DEBUG

#include <sys/strsubr.h>
/*
 * make sure curthread is not on anyone's pollhead list any more.
 */
static void
pollcheckphlist(void)
{
	int i;
	file_t *fp;
	uf_entry_t *ufp;
	uf_info_t *fip = P_FINFO(curproc);
	struct stdata *stp;
	polldat_t *pdp;

	mutex_enter(&fip->fi_lock);
	for (i = 0; i < fip->fi_nfiles; i++) {
		UF_ENTER(ufp, fip, i);
		if ((fp = ufp->uf_file) != NULL) {
			if ((stp = fp->f_vnode->v_stream) != NULL) {
				PH_ENTER(&stp->sd_pollist);
				pdp = stp->sd_pollist.ph_list;
				while (pdp) {
					ASSERT(pdp->pd_thread != curthread);
					pdp = pdp->pd_next;
				}
				PH_EXIT(&stp->sd_pollist);
			}
		}
		UF_EXIT(ufp);
	}
	mutex_exit(&fip->fi_lock);
}

/*
 * For a resolved set poll list, the xref info in the pcache should be
 * consistent with this poll list.
 */
static int
pollcheckxref(pollstate_t *ps, int cacheindex)
{
	pollfd_t *pollfdp = ps->ps_pcacheset[cacheindex].pcs_pollfd;
	pollcache_t *pcp = ps->ps_pcache;
	polldat_t *pdp;
	int	i;
	xref_t	*refp;

	for (i = 0; i < ps->ps_pcacheset[cacheindex].pcs_nfds; i++) {
		if (pollfdp[i].fd < 0) {
			continue;
		}
		pdp = pcache_lookup_fd(pcp, pollfdp[i].fd);
		ASSERT(pdp != NULL);
		ASSERT(pdp->pd_ref != NULL);
		refp = &pdp->pd_ref[cacheindex];
		if (refp->xf_position >= 0) {
			ASSERT(refp->xf_refcnt >= 1);
			ASSERT(pollfdp[refp->xf_position].fd == pdp->pd_fd);
			if (refp->xf_refcnt > 1) {
				int	j;
				int	count = 0;

				for (j = refp->xf_position;
				    j < ps->ps_pcacheset[cacheindex].pcs_nfds;
				    j++) {
					if (pollfdp[j].fd == pdp->pd_fd) {
						count++;
					}
				}
				ASSERT(count == refp->xf_refcnt);
			}
		}
	}
	return (1);
}

/*
 * For every cached pollfd, its polldat struct should be consistent with
 * what is in the pcacheset lists.
 */
static void
checkpolldat(pollstate_t *ps)
{
	pollcache_t	*pcp = ps->ps_pcache;
	polldat_t	**hashtbl;
	int		i;

	hashtbl = pcp->pc_hash;
	for (i = 0; i < pcp->pc_hashsize; i++) {
		polldat_t	*pdp;

		for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
			ASSERT(pdp->pd_ref != NULL);
			if (pdp->pd_count > 0) {
				xref_t		*refp;
				int		j;
				pollcacheset_t	*pcsp;
				pollfd_t	*pollfd;

				for (j = 0; j < ps->ps_nsets; j++) {
					refp = &pdp->pd_ref[j];
					if (refp->xf_refcnt > 0) {
						pcsp = &ps->ps_pcacheset[j];
						ASSERT(refp->xf_position <
						    pcsp->pcs_nfds);
						pollfd = pcsp->pcs_pollfd;
						ASSERT(pdp->pd_fd ==
						    pollfd[refp->xf_position].fd);
					}
				}
			}
		}
	}
}

/*
 * every wfd element on ph_list must have a corresponding fpollinfo on the
 * uf_fpollinfo list. This is a variation of infpollinfo() w/o holding locks.
 */
void
checkwfdlist(vnode_t *vp, fpollinfo_t *fpip)
{
	stdata_t *stp;
	polldat_t *pdp;
	fpollinfo_t *fpip2;

	if ((stp = vp->v_stream) == NULL) {
		return;
	}
	PH_ENTER(&stp->sd_pollist);
	for (pdp = stp->sd_pollist.ph_list; pdp; pdp = pdp->pd_next) {
		if (pdp->pd_thread != NULL &&
		    pdp->pd_thread->t_procp == curthread->t_procp) {
			for (fpip2 = fpip; fpip2; fpip2 = fpip2->fp_next) {
				if (pdp->pd_thread == fpip2->fp_thread) {
					break;
				}
			}
			ASSERT(fpip2 != NULL);
		}
	}
	PH_EXIT(&stp->sd_pollist);
}

/*
 * For each cached fd whose bit is not set in the bitmap, its revents field
 * in the current poll list should be 0.
 */
static int
pollcheckrevents(pollstate_t *ps, int begin, int end, int cacheindex)
{
	pollcache_t	*pcp = ps->ps_pcache;
	pollfd_t	*pollfdp = ps->ps_pollfd;
	int		i;

	for (i = begin; i < end; i++) {
		polldat_t	*pdp;

		ASSERT(!BT_TEST(pcp->pc_bitmap, i));
		pdp = pcache_lookup_fd(pcp, i);
		if (pdp && pdp->pd_fp != NULL) {
			xref_t	*refp;
			int	entry;

			ASSERT(pdp->pd_ref != NULL);
			refp = &pdp->pd_ref[cacheindex];
			if (refp->xf_refcnt == 0) {
				continue;
			}
			entry = refp->xf_position;
			ASSERT(entry >= 0);
			ASSERT(pollfdp[entry].revents == 0);
			if (refp->xf_refcnt > 1) {
				int	j;

				for (j = entry + 1; j < ps->ps_nfds; j++) {
					if (pollfdp[j].fd == i) {
						ASSERT(pollfdp[j].revents == 0);
					}
				}
			}
		}
	}
	return (1);
}

#endif	/* DEBUG */
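
/*
 * Allocate a zeroed pollcache. The bitmap and fd hash table are sized
 * and allocated separately, in pcache_create(), once nfds is known.
 */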
pollcache_t *
pcache_alloc(void)
{
	return (kmem_zalloc(sizeof (pollcache_t), KM_SLEEP));
}
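
/*
 * Size and allocate the bitmap and fd hash table for a pollcache. Both
 * are rounded up to the next multiple of their respective chunk sizes;
 * the mask arithmetic below assumes POLLMAPCHUNK and POLLHASHCHUNKSZ
 * are powers of two.
 */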
void
pcache_create(pollcache_t *pcp, nfds_t nfds)
{
	size_t	mapsize;

	/*
	 * allocate enough bits for the poll fd list
	 */
	if ((mapsize = POLLMAPCHUNK) <= nfds) {
		mapsize = (nfds + POLLMAPCHUNK - 1) & ~(POLLMAPCHUNK - 1);
	}
	pcp->pc_bitmap = kmem_zalloc((mapsize / BT_NBIPUL) * sizeof (ulong_t),
	    KM_SLEEP);
	pcp->pc_mapsize = mapsize;
	/*
	 * The hash size is at least POLLHASHCHUNKSZ. If the user polls a
	 * large number of fds to start with, allocate a bigger hash table
	 * (to the nearest multiple of POLLHASHCHUNKSZ) because dynamically
	 * growing a hash table is expensive.
	 */
	if (nfds < POLLHASHCHUNKSZ) {
		pcp->pc_hashsize = POLLHASHCHUNKSZ;
	} else {
		pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) &
		    ~(POLLHASHCHUNKSZ - 1);
	}
	pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *),
	    KM_SLEEP);
}
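
/*
 * Tear down a pollcache: free every polldat (and its xref array) hanging
 * off the hash table, then the hash table and bitmap themselves, and
 * finally the pollcache.
 */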
void
pcache_destroy(pollcache_t *pcp)
{
	polldat_t	**hashtbl;
	int		i;

	hashtbl = pcp->pc_hash;
	for (i = 0; i < pcp->pc_hashsize; i++) {
		if (hashtbl[i] != NULL) {
			polldat_t *pdp, *pdp2;

			pdp = hashtbl[i];
			while (pdp != NULL) {
				pdp2 = pdp->pd_hashnext;
				if (pdp->pd_ref != NULL) {
					kmem_free(pdp->pd_ref, sizeof (xref_t) *
					    pdp->pd_nsets);
				}
				kmem_free(pdp, sizeof (polldat_t));
				pdp = pdp2;
				pcp->pc_fdcount--;
			}
		}
	}
	ASSERT(pcp->pc_fdcount == 0);
	kmem_free(pcp->pc_hash, sizeof (polldat_t *) * pcp->pc_hashsize);
	kmem_free(pcp->pc_bitmap,
	    sizeof (ulong_t) * (pcp->pc_mapsize / BT_NBIPUL));
	mutex_destroy(&pcp->pc_no_exit);
	mutex_destroy(&pcp->pc_lock);
	cv_destroy(&pcp->pc_cv);
	cv_destroy(&pcp->pc_busy_cv);
	kmem_free(pcp, sizeof (pollcache_t));
}
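
/*
 * Allocate an array of 'nsets' zeroed pollcacheset structures.
 */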
pollcacheset_t *
pcacheset_create(int nsets)
{
	return (kmem_zalloc(sizeof (pollcacheset_t) * nsets, KM_SLEEP));
}
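
/*
 * Free each cached poll list, then the pollcacheset array itself.
 */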
void
pcacheset_destroy(pollcacheset_t *pcsp, int nsets)
{
	int	i;

	for (i = 0; i < nsets; i++) {
		if (pcsp[i].pcs_pollfd != NULL) {
			kmem_free(pcsp[i].pcs_pollfd, pcsp[i].pcs_nfds *
			    sizeof (pollfd_t));
		}
	}
	kmem_free(pcsp, sizeof (pollcacheset_t) * nsets);
}

/*
 * Check each duplicated poll fd in the poll list. It may be necessary to
 * VOP_POLL the same fd again using different poll events. getf() has been
 * done by the caller. This routine returns 0 if it can successfully process
 * the entire poll fd list. It returns -1 if the underlying vnode has changed
 * during a VOP_POLL, in which case the caller has to repoll. It returns a
 * positive value if VOP_POLL failed.
 */
static int
plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp,
    int entry, int *fdcntp)
{
	int	i;
	int	fd;
	nfds_t	nfds = psp->ps_nfds;

	fd = pollfdp[entry].fd;
	for (i = entry + 1; i < nfds; i++) {
		if (pollfdp[i].fd == fd) {
			if (pollfdp[i].events == pollfdp[entry].events) {
				if ((pollfdp[i].revents =
				    pollfdp[entry].revents) != 0) {
					(*fdcntp)++;
				}
			} else {
				int	error;
				pollhead_t *php;
				pollcache_t *pcp = psp->ps_pcache;

				/*
				 * The events are different. VOP_POLL on this
				 * fd so that we don't miss any revents.
				 */
				php = NULL;
				ASSERT(curthread->t_pollcache == NULL);
				error = VOP_POLL(fp->f_vnode,
				    pollfdp[i].events, 0,
				    &pollfdp[i].revents, &php, NULL);
				if (error) {
					return (error);
				}
				/*
				 * layered devices (e.g. the console driver)
				 * may change the vnode and thus the pollhead
				 * pointer out from underneath us.
				 */
				if (php != NULL && pdp->pd_php != NULL &&
				    php != pdp->pd_php) {
					pollhead_delete(pdp->pd_php, pdp);
					pdp->pd_php = php;
					pollhead_insert(php, pdp);
					/*
					 * We could have missed a wakeup on the
					 * new target device. Make sure the new
					 * target gets polled once.
					 */
					BT_SET(pcp->pc_bitmap, fd);
					return (-1);
				}
				if (pollfdp[i].revents) {
					(*fdcntp)++;
				}
			}
		}
	}
	return (0);
}