/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright 2017 Joyent, Inc.
 */
#include <sys/types.h>
#include <sys/devops.h>
#include <sys/modctl.h>
#include <sys/sunddi.h>
#include <sys/poll_impl.h>
#include <sys/errno.h>
#include <sys/mkdev.h>
#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/bitmap.h>
#include <sys/devpoll.h>
#include <sys/resource.h>
#include <sys/schedctl.h>
#include <sys/epoll.h>

/* local data struct */
static dp_entry_t **devpolltbl;	/* dev poll entries */
static size_t dptblsize;

static kmutex_t devpoll_lock;	/* lock protecting dev tbl */
int devpoll_init;		/* is /dev/poll initialized already */
/* device local functions */

static int dpopen(dev_t *devp, int flag, int otyp, cred_t *credp);
static int dpwrite(dev_t dev, struct uio *uiop, cred_t *credp);
static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp);
static int dppoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);
static int dpclose(dev_t dev, int flag, int otyp, cred_t *credp);
static dev_info_t *dpdevi;

static struct cb_ops dp_cb_ops = {
    ddi_prop_op,		/* prop_op */
    (struct streamtab *)0,	/* streamtab */
    CB_REV,			/* cb_ops revision */

static int dpattach(dev_info_t *, ddi_attach_cmd_t);
static int dpdetach(dev_info_t *, ddi_detach_cmd_t);
static int dpinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static struct dev_ops dp_ops = {
    DEVO_REV,			/* devo_rev */
    nulldev,			/* identify */
    dpattach,			/* attach */
    dpdetach,			/* detach */
    &dp_cb_ops,			/* driver operations */
    (struct bus_ops *)NULL,	/* bus operations */
    ddi_quiesce_not_needed,	/* quiesce */

static struct modldrv modldrv = {
    &mod_driverops,		/* type of module - a driver */

static struct modlinkage modlinkage = {

static void pcachelink_assoc(pollcache_t *, pollcache_t *);
static void pcachelink_mark_stale(pollcache_t *);
static void pcachelink_purge_stale(pollcache_t *);
static void pcachelink_purge_all(pollcache_t *);
/*
 * The /dev/poll driver shares most of its code with the poll system call,
 * whose code is in common/syscall/poll.c. In the poll(2) design, the
 * pollcache structure is per lwp. An implicit assumption is made there that
 * some portion of pollcache will never be touched by other lwps. E.g., in
 * the poll(2) design, no lwp will ever need to grow the bitmap of another
 * lwp. This assumption is not true for /dev/poll; hence the need for extra
 * locking.
 *
 * To allow more parallelism, each /dev/poll file descriptor (indexed by
 * minor number) has its own lock. Since read (dpioctl) is a much more
 * frequent operation than write, we want to allow multiple reads on the
 * same /dev/poll fd. However, we prevent writes from being starved by
 * giving priority to write operations. Theoretically writes can starve
 * reads as well, but in a practical sense this is not important because
 * (1) writes happen less often than reads, and (2) the write operation
 * defines the content of the cached fd set. If writes happen so often that
 * they can starve reads, the cached set is very unstable and it may not
 * make sense to read it anyway. Therefore, the writers-starving-readers
 * case is not handled in this design.
 */
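
/*
 * In outline, the writer-priority handshake on a dp_entry_t looks like the
 * condensed, illustrative sketch below. It is distilled from dpwrite() and
 * dpioctl() later in this file; signal handling and error paths are omitted
 * here, so see those functions for the authoritative sequence:
 *
 *	mutex_enter(&dpep->dpe_lock);
 *	dpep->dpe_writerwait++;
 *	while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0)
 *		(void) cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock);
 *	dpep->dpe_writerwait--;
 *	dpep->dpe_flag |= DP_WRITER_PRESENT;
 *	mutex_exit(&dpep->dpe_lock);
 *
 *	(exclusive section: readers hold off while a writer is present or
 *	waiting)
 *
 *	mutex_enter(&dpep->dpe_lock);
 *	dpep->dpe_flag &= ~DP_WRITER_PRESENT;
 *	cv_broadcast(&dpep->dpe_cv);
 *	mutex_exit(&dpep->dpe_lock);
 */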
    dptblsize = DEVPOLLSIZE;
    devpolltbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
    mutex_init(&devpoll_lock, NULL, MUTEX_DEFAULT, NULL);

    if ((error = mod_install(&modlinkage)) != 0) {
        kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);

    if ((error = mod_remove(&modlinkage)) != 0) {

    mutex_destroy(&devpoll_lock);
    kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);

int
_info(struct modinfo *modinfop)
{
    return (mod_info(&modlinkage, modinfop));
}
static int
dpattach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
    if (ddi_create_minor_node(devi, "poll", S_IFCHR, 0, DDI_PSEUDO, NULL)
        == DDI_FAILURE) {
        ddi_remove_minor_node(devi, NULL);
        return (DDI_FAILURE);
    }

    return (DDI_SUCCESS);
}

static int
dpdetach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
    if (cmd != DDI_DETACH)
        return (DDI_FAILURE);

    ddi_remove_minor_node(devi, NULL);
    return (DDI_SUCCESS);
}
static int
dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
    switch (infocmd) {
    case DDI_INFO_DEVT2DEVINFO:
        *result = (void *)dpdevi;
        break;
    case DDI_INFO_DEVT2INSTANCE:
/*
 * dp_pcache_poll has similar logic to pcache_poll() in poll.c. The major
 * differences are: (1) /dev/poll requires scanning the bitmap starting at
 * where it was stopped last time, instead of always starting from 0;
 * (2) since the user may not have cleaned up the cached fds when they were
 * closed, some polldats in the cache may refer to closed or reused fds. We
 * need to check for those cases.
 *
 * NOTE: Upon closing an fd, automatic poll cache cleanup is done for
 *	 poll(2) caches but NOT for /dev/poll caches. So expect some
 *	 stale entries.
 */
dp_pcache_poll(dp_entry_t *dpep, void *dpbuf,
    pollcache_t *pcp, nfds_t nfds, int *fdcntp)
{
    int start, ostart, end;
    epoll_event_t *epoll;
    short mask = POLLRDHUP | POLLWRBAND;
    boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;

    ASSERT(MUTEX_HELD(&pcp->pc_lock));
    if (pcp->pc_bitmap == NULL) {
        /*
         * No need to search because no poll fd
         * has been cached.
         */

    epoll = (epoll_event_t *)dpbuf;

    pfdp = (pollfd_t *)dpbuf;

    start = ostart = pcp->pc_mapstart;
    end = pcp->pc_mapend;

        /*
         * Started from the very beginning; no need to wrap around.
         */

    while ((fdcnt < nfds) && !done) {
        /*
         * Examine the bit map in a circular fashion
         * to avoid starvation. Always resume from
         * last stop. Scan till end of the map. Then
         * wrap around.
         */
        fd = bt_getlowbit(pcp->pc_bitmap, start, end);
            pdp = pcache_lookup_fd(pcp, fd);
            ASSERT(pdp->pd_fd == fd);
            if (pdp->pd_fp == NULL) {
                /*
                 * The fd is POLLREMOVEd. This fd is
                 * logically no longer cached. So move
                 * on to the next one.
                 */

            if ((fp = getf(fd)) == NULL) {
                /*
                 * The fd has been closed, but the user has not
                 * done a POLLREMOVE on this fd yet. Instead of
                 * cleaning it here implicitly, we return
                 * POLLNVAL. This is consistent with poll(2)
                 * polling a closed fd. Hopefully this will
                 * remind the user to do a POLLREMOVE.
                 */
                if (!is_epoll && pfdp != NULL) {
                    pfdp[fdcnt].revents = POLLNVAL;

                /*
                 * In the epoll compatibility case, we actually
                 * perform the implicit removal to remain
                 * closer to the epoll semantics.
                 */
                if (pdp->pd_php != NULL) {
                    pollhead_delete(pdp->pd_php,
                        pdp);

                BT_CLEAR(pcp->pc_bitmap, fd);

            if (fp != pdp->pd_fp) {
                /*
                 * The user is polling on a cached fd which was
                 * closed and then reused. Unfortunately there
                 * is no good way to communicate this fact to
                 * the user.
                 *
                 * If the file struct is also reused, we may
                 * not be able to detect the fd reuse at all.
                 * As long as this does not cause system
                 * failure and/or memory leaks, we will play
                 * along. The man page states that if the user
                 * does not clean up closed fds, polling
                 * results will be indeterminate.
                 *
                 * XXX: perhaps log the detection of fd reuse?
                 */

                /*
                 * When this situation has been detected, it's
                 * likely that any existing pollhead is
                 * ill-suited to perform proper wake-ups.
                 *
                 * Clean up the old entry under the expectation
                 * that a valid one will be provided as part of
                 * the later VOP_POLL.
                 */
                if (pdp->pd_php != NULL) {
                    pollhead_delete(pdp->pd_php, pdp);
            /*
             * XXX - pollrelock() logic needs to know which
             * pollcache lock to grab. It'd be a cleaner solution
             * if we could pass pcp as an argument in the VOP_POLL
             * interface instead of implicitly passing it via the
             * thread_t struct. On the other hand, changing the
             * VOP_POLL interface will require every driver/file
             * system poll routine to change. May want to revisit
             * the tradeoff later.
             */
            curthread->t_pollcache = pcp;
            error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0,
                &revent, &php, NULL);

            /*
             * Recheck edge-triggered descriptors which lack a
             * pollhead. While this check is performed when an fd
             * is added to the pollcache in dpwrite(), subsequent
             * descriptor manipulation could cause a different
             * resource to be present now.
             */
            if ((pdp->pd_events & POLLET) && error == 0 &&
                pdp->pd_php == NULL && php == NULL && revent != 0) {
                /*
                 * The same POLLET-only VOP_POLL is used in an
                 * attempt to coax a pollhead from older
                 * drivers.
                 */
                error = VOP_POLL(fp->f_vnode, POLLET,
                    0, &levent, &php, NULL);

            curthread->t_pollcache = NULL;

            /*
             * Layered devices (e.g. the console driver)
             * may change the vnode and thus the pollhead
             * pointer out from underneath us.
             */
            if (php != NULL && pdp->pd_php != NULL &&
                php != pdp->pd_php) {
                pollhead_delete(pdp->pd_php, pdp);
                pollhead_insert(php, pdp);

                /*
                 * The bit should still be set.
                 */
                ASSERT(BT_TEST(pcp->pc_bitmap, fd));
                pfdp[fdcnt].events = pdp->pd_events;
                pfdp[fdcnt].revents = revent;
            } else if (epoll != NULL) {
                epoll_event_t *ep = &epoll[fdcnt];

                ASSERT(epoll != NULL);
                ep->data.u64 = pdp->pd_epolldata;
                /*
                 * Since POLLNVAL is a legal event for
                 * VOP_POLL handlers to emit, it must
                 * be translated into something epoll-legal.
                 */
                if (revent & POLLNVAL) {

                /*
                 * If any of the event bits are set for
                 * which the poll and epoll representations
                 * differ, swizzle in the native epoll
                 * values.
                 */
                ep->events = (revent & ~mask) |
                    ((revent & POLLRDHUP) ?
                    EPOLLRDHUP : 0) |
                    ((revent & POLLWRBAND) ?
                    EPOLLWRBAND : 0);

                /*
                 * We define POLLWRNORM to be POLLOUT,
                 * but epoll has separate definitions
                 * for them; if POLLOUT is set and the
                 * user has asked for EPOLLWRNORM, set
                 * that as well.
                 */
                if ((revent & POLLOUT) &&
                    (pdp->pd_events & EPOLLWRNORM)) {
                    ep->events |= EPOLLWRNORM;
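
                /*
                 * For example (assuming the usual poll/epoll bit
                 * equivalences, where POLLOUT and EPOLLOUT share a
                 * value): a VOP_POLL result of (POLLOUT | POLLRDHUP)
                 * on a descriptor whose caller asked for EPOLLWRNORM
                 * ends up reported here as
                 * (EPOLLOUT | EPOLLRDHUP | EPOLLWRNORM).
                 */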
                pollstate_t *ps =
                    curthread->t_pollstate;
                /*
                 * The devpoll handle itself is being
                 * polled. Notify the caller of any
                 * readable event(s), leaving as much
                 * state as possible untouched.
                 */

                /*
                 * If a call to pollunlock() fails
                 * during VOP_POLL, skip over the fd
                 * and continue polling.
                 *
                 * Otherwise, report that there is an
                 * event pending.
                 */
                if ((ps->ps_flags & POLLSTATE_ULFAIL)
                    != 0) {

            /* Handle special polling modes. */
            if (pdp->pd_events & POLLONESHOT) {
                /*
                 * If POLLONESHOT is set, perform the
                 * implicit POLLREMOVE.
                 */
                if (pdp->pd_php != NULL) {
                    pollhead_delete(pdp->pd_php,
                        pdp);

                BT_CLEAR(pcp->pc_bitmap, fd);
            } else if (pdp->pd_events & POLLET) {
                /*
                 * Wire up the pollhead which should
                 * have been provided. Edge-triggered
                 * polling cannot function properly
                 * with drivers which do not emit one.
                 */
                if (php != NULL &&
                    pdp->pd_php == NULL) {
                    pollhead_insert(php, pdp);

                /*
                 * If the driver has emitted a pollhead,
                 * clear the bit in the bitmap which
                 * effectively latches the edge on a
                 * pollwakeup() from the driver.
                 */
                if (pdp->pd_php != NULL) {
                    BT_CLEAR(pcp->pc_bitmap, fd);

            } else if (php != NULL) {
                /*
                 * We clear a bit or cache a poll fd if
                 * the driver returns a poll head ptr,
                 * which is expected in the case of 0
                 * revents. Some buggy drivers may return
                 * a NULL php pointer with 0 revents. In
                 * this case, we just treat the driver as
                 * "noncachable" and do not clear the bit
                 * in the bitmap.
                 */
                if ((pdp->pd_php != NULL) &&
                    ((pcp->pc_flag & PC_POLLWAKE) == 0)) {
                    BT_CLEAR(pcp->pc_bitmap, fd);

                if (pdp->pd_php == NULL) {
                    pollhead_insert(php, pdp);
                    /*
                     * An event of interest may have
                     * arrived between the VOP_POLL() and
                     * the pollhead_insert(); check again.
                     */

        /*
         * No bit set in the range. Check for wrap around.
         */

    pcp->pc_mapstart = start;

    ASSERT(*fdcntp == 0);
static int
dpopen(dev_t *devp, int flag, int otyp, cred_t *credp)
{
    ASSERT(devpoll_init);
    ASSERT(dptblsize <= MAXMIN);
    mutex_enter(&devpoll_lock);
    for (minordev = 0; minordev < dptblsize; minordev++) {
        if (devpolltbl[minordev] == NULL) {
            devpolltbl[minordev] = (dp_entry_t *)RESERVED;

    if (minordev == dptblsize) {
        /*
         * Used up every entry in the existing devpoll table.
         * Grow the table by DEVPOLLSIZE.
         */
        if ((oldsize = dptblsize) >= MAXMIN) {
            mutex_exit(&devpoll_lock);

        dptblsize += DEVPOLLSIZE;
        if (dptblsize > MAXMIN) {

        newtbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
        bcopy(devpolltbl, newtbl, sizeof (caddr_t) * oldsize);
        kmem_free(devpolltbl, sizeof (caddr_t) * oldsize);

        devpolltbl[minordev] = (dp_entry_t *)RESERVED;

    mutex_exit(&devpoll_lock);

    dpep = kmem_zalloc(sizeof (dp_entry_t), KM_SLEEP);

    /*
     * Allocate a pollcache skeleton here. Delay allocating bitmap
     * structures until dpwrite() time, since we don't know the
     * optimal size yet. We also delay setting the pid until either
     * dpwrite() or an attempt to poll on the instance, allowing parents
     * to create instances of /dev/poll for their children. (In the
     * epoll compatibility case, this check isn't performed to maintain
     * semantic compatibility.)
     */
    pcp = pcache_alloc();
    dpep->dpe_pcache = pcp;

    *devp = makedevice(getmajor(*devp), minordev);	/* clone the driver */
    mutex_enter(&devpoll_lock);
    ASSERT(minordev < dptblsize);
    ASSERT(devpolltbl[minordev] == (dp_entry_t *)RESERVED);
    devpolltbl[minordev] = dpep;
    mutex_exit(&devpoll_lock);
/*
 * Write to /dev/poll to add/remove fds to/from a cached poll fd set,
 * or to change poll events for a watched fd.
 */
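
/*
 * For reference, a minimal userland consumer of this interface looks
 * roughly like the sketch below (illustrative only; 'sockfd' is a
 * placeholder descriptor and error handling is omitted). Descriptors are
 * cached by writing pollfd entries to the open /dev/poll handle, results
 * are harvested with the DP_POLL ioctl, and POLLREMOVE drops a cached
 * entry:
 *
 *	#include <sys/devpoll.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int dpfd = open("/dev/poll", O_RDWR);
 *	pollfd_t pfd = { .fd = sockfd, .events = POLLIN };
 *	(void) write(dpfd, &pfd, sizeof (pfd));
 *
 *	struct dvpoll dvp;
 *	pollfd_t ready[8];
 *	dvp.dp_fds = ready;
 *	dvp.dp_nfds = 8;
 *	dvp.dp_timeout = -1;		(milliseconds; -1 waits indefinitely)
 *	int n = ioctl(dpfd, DP_POLL, &dvp);	(returns count of ready fds)
 *
 *	pfd.events = POLLREMOVE;	(stop watching sockfd)
 *	(void) write(dpfd, &pfd, sizeof (pfd));
 */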
static int
dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
{
    pollfd_t *pollfdp, *pfdp;
    dvpoll_epollfd_t *epfdp;
    struct pollhead *php = NULL;
    boolean_t is_epoll, fds_added = B_FALSE;

    minor = getminor(dev);

    mutex_enter(&devpoll_lock);
    ASSERT(minor < dptblsize);
    dpep = devpolltbl[minor];
    ASSERT(dpep != NULL);
    mutex_exit(&devpoll_lock);

    mutex_enter(&dpep->dpe_lock);
    pcp = dpep->dpe_pcache;
    is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
    size = (is_epoll) ? sizeof (dvpoll_epollfd_t) : sizeof (pollfd_t);
    mutex_exit(&dpep->dpe_lock);

    if (!is_epoll && curproc->p_pid != pcp->pc_pid) {
        if (pcp->pc_pid != -1) {

        pcp->pc_pid = curproc->p_pid;

    uiosize = uiop->uio_resid;
    pollfdnum = uiosize / size;
    /*
     * We want to make sure that pollfdnum isn't large enough to DoS us,
     * but we also don't want to grab p_lock unnecessarily -- so we
     * perform the full check against our resource limits if and only if
     * pollfdnum is larger than the known-to-be-sane value of UINT8_MAX.
     */
    if (pollfdnum > UINT8_MAX) {
        mutex_enter(&curproc->p_lock);
        if (pollfdnum >
            (uint_t)rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
            curproc->p_rctls, curproc)) {
            (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
                curproc->p_rctls, curproc, RCA_SAFE);
            mutex_exit(&curproc->p_lock);

        mutex_exit(&curproc->p_lock);

    /*
     * Copy in the pollfd array. Walk through the array and add
     * each polled fd to the cached set.
     */
    pollfdp = kmem_alloc(uiosize, KM_SLEEP);
    limit = (uintptr_t)pollfdp + (pollfdnum * size);

    /*
     * Although /dev/poll uses the write(2) interface to cache fds, it's
     * not supposed to function as a seekable device. To prevent the offset
     * from growing and eventually exceeding the maximum, reset the offset
     * here for every call.
     */
    uiop->uio_loffset = 0;

    /*
     * Use uiocopy instead of uiomove when populating pollfdp, keeping
     * uio_resid untouched for now. Write syscalls will translate EINTR
     * into a success if they detect "successfully transferred" data via an
     * updated uio_resid. Falsely suppressing such errors is disastrous.
     */
    if ((error = uiocopy((caddr_t)pollfdp, uiosize, UIO_WRITE, uiop,
        &copysize)) != 0) {
        kmem_free(pollfdp, uiosize);
    /*
     * We are about to enter the core portion of dpwrite(). Make sure this
     * write has exclusive access in this portion of the code, i.e., no
     * other writers in this code.
     *
     * Waiting for all readers to drop their references to the dpe is
     * unnecessary since the pollcache itself is protected by pc_lock.
     */
    mutex_enter(&dpep->dpe_lock);
    dpep->dpe_writerwait++;
    while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) {
        ASSERT(dpep->dpe_refcnt != 0);

        if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
            dpep->dpe_writerwait--;
            mutex_exit(&dpep->dpe_lock);
            kmem_free(pollfdp, uiosize);

    dpep->dpe_writerwait--;
    dpep->dpe_flag |= DP_WRITER_PRESENT;

    if (!is_epoll && (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0) {
        /*
         * The epoll compat mode was enabled while we were waiting to
         * establish write access. It is not safe to continue since
         * state was prepared for non-epoll operation.
         */

    mutex_exit(&dpep->dpe_lock);

    /*
     * Since dpwrite() may recursively walk an added /dev/poll handle,
     * pollstate_enter() deadlock and loop detection must be used.
     */
    (void) pollstate_create();
    VERIFY(pollstate_enter(pcp) == PSE_SUCCESS);

    if (pcp->pc_bitmap == NULL) {
        pcache_create(pcp, pollfdnum);

    for (pfdp = pollfdp; (uintptr_t)pfdp < limit;
        pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) {
        if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) {
            /*
             * epoll semantics demand that we return EBADF if our
             * specified fd is invalid.
             */
        pdp = pcache_lookup_fd(pcp, fd);
        if (pfdp->events != POLLREMOVE) {
            /*
             * If we're in epoll compatibility mode, check
             * that the fd is valid before allocating
             * anything for it; epoll semantics demand that
             * we return EBADF if our specified fd is
             * invalid.
             */
            if ((fp = getf(fd)) == NULL) {

            pdp = pcache_alloc_fd(0);
            pdp->pd_pcache = pcp;
            pcache_insert_fd(pcp, pdp, pollfdnum);

            /*
             * epoll semantics demand that we error out if
             * a file descriptor is added twice, which we
             * check (imperfectly) by checking if we both
             * have the file descriptor cached and the
             * file pointer that corresponds to the file
             * descriptor matches our cached value. If
             * there is a pointer mismatch, the file
             * descriptor was closed without being removed.
             * The converse is clearly not true, however,
             * so to narrow the window by which a spurious
             * EEXIST may be returned, we also check if
             * this fp has been added to an epoll control
             * descriptor in the past; if it hasn't, we
             * know that this is due to fp reuse -- it's
             * not a true EEXIST case. (By performing this
             * additional check, we limit the window of
             * spurious EEXIST to situations where a single
             * file descriptor is being used across two or
             * more epoll control descriptors -- and even
             * then, the file descriptor must be closed and
             * reused in a relatively tight time span.)
             */
            if (pdp->pd_fp != NULL &&
                (fp = getf(fd)) != NULL &&
                (fp->f_flag2 & FEPOLLED)) {

                /*
                 * We have decided that the cached
                 * information was stale: it either
                 * didn't match, or the fp had never
                 * actually been epoll()'d on before.
                 * We need to now clear our pd_events
                 * to assure that we don't mistakenly
                 * operate on cached event disposition.
                 */

                epfdp = (dvpoll_epollfd_t *)pfdp;
                pdp->pd_epolldata = epfdp->dpep_data;

            ASSERT(pdp->pd_fd == fd);
            ASSERT(pdp->pd_pcache == pcp);
            if (fd >= pcp->pc_mapsize) {
                mutex_exit(&pcp->pc_lock);
                pcache_grow_map(pcp, fd);
                mutex_enter(&pcp->pc_lock);

            if (fd > pcp->pc_mapend) {

            if (fp == NULL && (fp = getf(fd)) == NULL) {
                /*
                 * The fd is not valid. Since we can't pass
                 * this error back in the write() call, set
                 * the bit in the bitmap to force the DP_POLL
                 * ioctl to examine it.
                 */
                BT_SET(pcp->pc_bitmap, fd);
                pdp->pd_events |= pfdp->events;
            /*
             * To (greatly) reduce EEXIST false positives, we
             * denote that this fp has been epoll()'d. We do this
             * regardless of epoll compatibility mode, as the flag
             * is harmless if not in epoll compatibility mode.
             */
            fp->f_flag2 |= FEPOLLED;

            /*
             * Don't do VOP_POLL for an already cached fd with
             * the same poll events.
             */
            if ((pdp->pd_events == pfdp->events) &&
                (pdp->pd_fp == fp)) {
                /*
                 * The events are already cached.
                 */

            /*
             * Do VOP_POLL and cache this poll fd.
             *
             * XXX - pollrelock() logic needs to know which
             * pollcache lock to grab. It'd be a cleaner solution
             * if we could pass pcp as an argument in the VOP_POLL
             * interface instead of implicitly passing it via the
             * thread_t struct. On the other hand, changing the
             * VOP_POLL interface will require every driver/file
             * system poll routine to change. May want to revisit
             * the tradeoff later.
             */
            curthread->t_pollcache = pcp;
            error = VOP_POLL(fp->f_vnode, pfdp->events, 0,
                &pfdp->revents, &php, NULL);

            /*
             * Edge-triggered polling requires a pollhead in order
             * to initiate wake-ups properly. Drivers which are
             * savvy to POLLET presence, which should include
             * everything in-gate, will always emit one, regardless
             * of revent status. Older drivers which only emit a
             * pollhead if 'revents == 0' are given a second chance
             * here via a second VOP_POLL, with only POLLET set in
             * the events of interest. These circumstances should
             * induce any cacheable drivers to emit a pollhead.
             *
             * Drivers which never emit a pollhead will simply
             * disobey the expectation of edge-triggered behavior.
             * This includes recursive epoll which, even on Linux,
             * yields its events in a level-triggered fashion only.
             */
            if ((pdp->pd_events & POLLET) && error == 0 &&

                error = VOP_POLL(fp->f_vnode, POLLET, 0,
                    &levent, &php, NULL);

            curthread->t_pollcache = NULL;

            /*
             * We always set the bit when this fd is cached;
             * this forces the first DP_POLL to poll this fd.
             * Real performance gain comes from subsequent
             * DP_POLL. We also attempt a pollhead_insert();
             * if it's not possible, we'll do it in dpioctl().
             */
            BT_SET(pcp->pc_bitmap, fd);
            pdp->pd_events |= pfdp->events;

            if (pdp->pd_php == NULL) {
                pollhead_insert(php, pdp);

            if (pdp->pd_php != php) {
                pollhead_delete(pdp->pd_php,
                    pdp);
                pollhead_insert(php, pdp);

            if (pdp == NULL || pdp->pd_fp == NULL) {
                /*
                 * As with the add case (above), epoll
                 * semantics demand that we error out
                 * in this case.
                 */

            ASSERT(pdp->pd_fd == fd);

            ASSERT(pdp->pd_thread == NULL);
            if (pdp->pd_php != NULL) {
                pollhead_delete(pdp->pd_php, pdp);

            BT_CLEAR(pcp->pc_bitmap, fd);

    /*
     * Wake any pollcache waiters so they can check the new descriptors.
     *
     * Any fds added to a recursive-capable pollcache could themselves be
     * /dev/poll handles. To ensure that proper event propagation occurs,
     * parent pollcaches are woken too, so that they can create any needed
     * pcachelinks.
     */
    cv_broadcast(&pcp->pc_cv);
    pcache_wake_parents(pcp);

    pollstate_exit(pcp);
    mutex_enter(&dpep->dpe_lock);

    dpep->dpe_flag &= ~DP_WRITER_PRESENT;

    cv_broadcast(&dpep->dpe_cv);
    mutex_exit(&dpep->dpe_lock);
    kmem_free(pollfdp, uiosize);

    /*
     * The state of uio_resid is updated only after the pollcache
     * is successfully modified.
     */
    uioskip(uiop, copysize);
#define	DP_SIGMASK_RESTORE(ksetp) { \
	if (ksetp != NULL) { \
		mutex_enter(&p->p_lock); \
		if (lwp->lwp_cursig == 0) { \
			t->t_hold = lwp->lwp_sigoldmask; \
			t->t_flag &= ~T_TOMASK; \
		} \
		mutex_exit(&p->p_lock); \
	} \
}
static int
dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
    STRUCT_DECL(dvpoll, dvpoll);

    if (cmd == DP_POLL || cmd == DP_PPOLL) {
        /* do this now, before we sleep on DP_WRITER_PRESENT */

    minor = getminor(dev);
    mutex_enter(&devpoll_lock);
    ASSERT(minor < dptblsize);
    dpep = devpolltbl[minor];
    mutex_exit(&devpoll_lock);
    ASSERT(dpep != NULL);
    pcp = dpep->dpe_pcache;

    mutex_enter(&dpep->dpe_lock);
    is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;

    if (cmd == DP_EPOLLCOMPAT) {
        if (dpep->dpe_refcnt != 0) {
            /*
             * We can't turn on epoll compatibility while there
             * are outstanding operations.
             */
            mutex_exit(&dpep->dpe_lock);

        /*
         * epoll compatibility is a one-way street: there's no way
         * to turn it off for a particular open.
         */
        dpep->dpe_flag |= DP_ISEPOLLCOMPAT;
        mutex_exit(&dpep->dpe_lock);

    if (!is_epoll && curproc->p_pid != pcp->pc_pid) {
        if (pcp->pc_pid != -1) {
            mutex_exit(&dpep->dpe_lock);

        pcp->pc_pid = curproc->p_pid;

    /* Wait until all writers have cleared the handle before continuing */
    while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0 ||
        (dpep->dpe_writerwait != 0)) {
        if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
            mutex_exit(&dpep->dpe_lock);

    mutex_exit(&dpep->dpe_lock);
        size_t size, fdsize, dpsize;
        hrtime_t deadline = 0;
        k_sigset_t *ksetp = NULL;
        kthread_t *t = curthread;
        klwp_t *lwp = ttolwp(t);
        struct proc *p = ttoproc(curthread);

        STRUCT_INIT(dvpoll, mode);

        /*
         * The dp_setp member is only required/consumed for DP_PPOLL,
         * which otherwise uses the same structure as DP_POLL.
         */
        if (cmd == DP_POLL) {
            dpsize = (uintptr_t)STRUCT_FADDR(dvpoll, dp_setp) -
                (uintptr_t)STRUCT_FADDR(dvpoll, dp_fds);

            ASSERT(cmd == DP_PPOLL);
            dpsize = STRUCT_SIZE(dvpoll);

        if ((mode & FKIOCTL) != 0) {
            /* Kernel-internal ioctl call */
            bcopy((caddr_t)arg, STRUCT_BUF(dvpoll), dpsize);

            error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll),
                dpsize);
        deadline = STRUCT_FGET(dvpoll, dp_timeout);

        /*
         * Convert the deadline from relative milliseconds
         * to absolute nanoseconds. The caller must wait for at
         * least a tick.
         */
        deadline = MSEC2NSEC(deadline);
        deadline = MAX(deadline, nsec_per_tick);
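
        /*
         * For instance, on a system with a 100 Hz clock (nsec_per_tick
         * of 10,000,000 ns), a dp_timeout of 1 converts to 1,000,000 ns
         * and is then rounded up to one full tick, while a dp_timeout
         * of 250 converts to 250,000,000 ns and is used as-is.
         */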
        if (cmd == DP_PPOLL) {
            void *setp = STRUCT_FGETP(dvpoll, dp_setp);

            if ((mode & FKIOCTL) != 0) {
                /* Use the signal set directly */
                ksetp = (k_sigset_t *)setp;

                if (copyin(setp, &set, sizeof (set))) {

                sigutok(&set, &kset);

                mutex_enter(&p->p_lock);
                schedctl_finish_sigblock(t);
                lwp->lwp_sigoldmask = t->t_hold;

                t->t_flag |= T_TOMASK;

                /*
                 * Like ppoll() with a non-NULL sigset, we'll
                 * call cv_reltimedwait_sig() just to check for
                 * signals. This call will return immediately
                 * with either 0 (signalled) or -1 (no signal).
                 * There are some conditions whereby we can
                 * get 0 from cv_reltimedwait_sig() without
                 * a true signal (e.g., a directed stop), so
                 * we restore our signal mask in the unlikely
                 * event that lwp_cursig is 0.
                 */
                if (!cv_reltimedwait_sig(&t->t_delay_cv,
                    &p->p_lock, 0, TR_CLOCK_TICK)) {
                    if (lwp->lwp_cursig == 0) {
                        t->t_hold = lwp->lwp_sigoldmask;
                        t->t_flag &= ~T_TOMASK;

                    mutex_exit(&p->p_lock);

                mutex_exit(&p->p_lock);

        if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) {
            /*
             * We are just using DP_POLL to sleep, so
             * we don't need any of the devpoll apparatus.
             * Do not check for signals if we have a zero timeout.
             */
            if (deadline == 0) {
                DP_SIGMASK_RESTORE(ksetp);

            mutex_enter(&curthread->t_delay_lock);
            while ((error =
                cv_timedwait_sig_hrtime(&curthread->t_delay_cv,
                &curthread->t_delay_lock, deadline)) > 0)
                continue;
            mutex_exit(&curthread->t_delay_lock);

            DP_SIGMASK_RESTORE(ksetp);

            return (error == 0 ? EINTR : 0);
            size = nfds * (fdsize = sizeof (epoll_event_t));

            size = nfds * (fdsize = sizeof (pollfd_t));

        /*
         * XXX It would be nice not to have to alloc each time, but it
         * requires another per-thread structure hook. This can be
         * implemented later if data suggests that it's necessary.
         */
        ps = pollstate_create();

        if (ps->ps_dpbufsize < size) {
            /*
             * If nfds is larger than twice the current maximum
             * open file count, we'll silently clamp it. This
             * only limits our exposure to allocating an
             * inordinate amount of kernel memory; it doesn't
             * otherwise affect the semantics. (We have this
             * check at twice the maximum instead of merely the
             * maximum because some applications pass an nfds that
             * is only slightly larger than their limit.)
             */
            mutex_enter(&p->p_lock);
            if ((nfds >> 1) > p->p_fno_ctl) {
                nfds = p->p_fno_ctl;
                size = nfds * fdsize;

            mutex_exit(&p->p_lock);

            if (ps->ps_dpbufsize < size) {
                kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize);
                ps->ps_dpbuf = kmem_zalloc(size, KM_SLEEP);
                ps->ps_dpbufsize = size;
        VERIFY(pollstate_enter(pcp) == PSE_SUCCESS);

            pcp->pc_flag &= ~PC_POLLWAKE;

            /*
             * Mark all child pcachelinks as stale.
             * Those which are still part of the tree will be
             * marked as valid during the poll.
             */
            pcachelink_mark_stale(pcp);

            error = dp_pcache_poll(dpep, ps->ps_dpbuf,
                pcp, nfds, &fdcnt);
            if (fdcnt > 0 || error != 0)

            /* Purge still-stale child pcachelinks */
            pcachelink_purge_stale(pcp);

            /*
             * A pollwake has happened since we last polled the
             * cache.
             */
            if (pcp->pc_flag & PC_POLLWAKE)

            /*
             * Sleep until we are notified, signaled, or timed out.
             */
            if (deadline == 0) {
                /* immediate timeout; do not check signals */

            error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
                &pcp->pc_lock, deadline);

            /*
             * If we were awakened by a signal or timeout then
             * break the loop, else poll again.
             */
            error = (error == 0) ? EINTR : 0;

        pollstate_exit(pcp);

        DP_SIGMASK_RESTORE(ksetp);

        if (error == 0 && fdcnt > 0) {
            /*
             * It should be noted that FKIOCTL does not influence
             * the copyout (vs bcopy) of dp_fds at this time.
             */
            if (copyout(ps->ps_dpbuf,
                STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) {
        STRUCT_INIT(dvpoll, mode);
        error = copyin((caddr_t)arg, &pollfd, sizeof (pollfd_t));

        mutex_enter(&pcp->pc_lock);
        if (pcp->pc_hash == NULL) {
            /*
             * No need to search because no poll fd
             * has been cached.
             */
            mutex_exit(&pcp->pc_lock);

        if (pollfd.fd < 0) {
            mutex_exit(&pcp->pc_lock);

        pdp = pcache_lookup_fd(pcp, pollfd.fd);
        if ((pdp != NULL) && (pdp->pd_fd == pollfd.fd) &&
            (pdp->pd_fp != NULL)) {
            pollfd.revents = pdp->pd_events;
            if (copyout(&pollfd, (caddr_t)arg, sizeof (pollfd_t))) {
                mutex_exit(&pcp->pc_lock);

            mutex_exit(&pcp->pc_lock);
/*
 * Overview of Recursive Polling
 *
 * It is possible for /dev/poll to poll for events on file descriptors which
 * themselves are /dev/poll handles. Pending events in the child handle are
 * represented as readable data via the POLLIN flag. To limit surface area,
 * this recursion is presently allowed on only /dev/poll handles which have
 * been placed in epoll mode via the DP_EPOLLCOMPAT ioctl. Recursion depth is
 * limited to 5 in order to be consistent with Linux epoll.
 *
 * Extending dppoll() for VOP_POLL:
 *
 * The recursive /dev/poll implementation begins by extending dppoll() to
 * report when resources contained in the pollcache have relevant event state.
 * At the highest level, it means calling dp_pcache_poll() so it indicates if
 * fd events are present without consuming them or altering the pollcache
 * bitmap. This ensures that a subsequent DP_POLL operation on the bitmap will
 * yield the initiating event. Additionally, the VOP_POLL should return in
 * such a way that dp_pcache_poll() does not clear the parent bitmap entry
 * which corresponds to the child /dev/poll fd. This means that child
 * pollcaches will be checked during every poll, which facilitates the wake-up
 * behavior detailed below.
 *
 * Pollcache Links and Wake Events:
 *
 * Recursive /dev/poll avoids complicated pollcache locking constraints during
 * pollwakeup events by eschewing the traditional pollhead mechanism in favor
 * of a different approach. For each pollcache at the root of a recursive
 * /dev/poll "tree", pcachelink_t structures are established to all child
 * /dev/poll pollcaches. During pollnotify() in a child pollcache, the
 * linked list of pcachelink_t entries is walked, where those marked as valid
 * incur a cv_broadcast to their parent pollcache. Most notably, these
 * pcachelink_t cv wakeups are performed without acquiring pc_lock on the
 * parent pollcache (which would require careful deadlock avoidance). This
 * still allows the woken poll on the parent to discover the pertinent events
 * due to the fact that bitmap entries for the child pollcache are always
 * maintained by the dppoll() logic above.
 *
 * Depth Limiting and Loop Prevention:
 *
 * As each pollcache is encountered (either via DP_POLL or dppoll()), depth and
 * loop constraints are enforced via pollstate_enter(). The pollcache_t
 * pointer is compared against any existing entries in ps_pc_stack and is added
 * to the end if no match (and therefore loop) is found. Once poll operations
 * for a given pollcache_t are complete, pollstate_exit() clears the pointer
 * from the list. The pollstate_enter() and pollstate_exit() functions are
 * responsible for acquiring and releasing pc_lock, respectively.
 *
 * Deadlock Handling:
 *
 * Descending through a tree of recursive /dev/poll handles involves the tricky
 * business of sequentially entering multiple pollcache locks. This tree
 * topology cannot define a lock acquisition order in such a way that it is
 * immune to deadlocks between threads. The pollstate_enter() and
 * pollstate_exit() functions provide an interface for recursive /dev/poll
 * operations to safely lock pollcaches while failing gracefully in the face of
 * deadlocking topologies. (See pollstate_contend() for more detail about how
 * deadlocks are detected and resolved.)
 */
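
/*
 * From userland, this recursion is what makes nested epoll work when the
 * emulated epoll interfaces are layered on top of /dev/poll in epoll mode.
 * A minimal sketch (illustrative only; 'sockfd' is a placeholder and error
 * handling is omitted):
 *
 *	#include <sys/epoll.h>
 *
 *	int inner = epoll_create(1);
 *	int outer = epoll_create(1);
 *	struct epoll_event ev;
 *
 *	ev.events = EPOLLIN;
 *	ev.data.fd = sockfd;
 *	(void) epoll_ctl(inner, EPOLL_CTL_ADD, sockfd, &ev);
 *
 *	ev.events = EPOLLIN;
 *	ev.data.fd = inner;
 *	(void) epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);
 *
 * Readiness on sockfd then surfaces as POLLIN on 'inner', which in turn
 * wakes an epoll_wait() on 'outer' via the pcachelink machinery described
 * above; as noted elsewhere in this file, the recursive level is reported
 * in a level-triggered fashion only.
 */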
static int
dppoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
    minor = getminor(dev);
    mutex_enter(&devpoll_lock);
    ASSERT(minor < dptblsize);
    dpep = devpolltbl[minor];
    ASSERT(dpep != NULL);
    mutex_exit(&devpoll_lock);

    mutex_enter(&dpep->dpe_lock);
    if ((dpep->dpe_flag & DP_ISEPOLLCOMPAT) == 0) {
        /* Poll recursion is not yet supported for non-epoll handles */
        *reventsp = POLLERR;
        mutex_exit(&dpep->dpe_lock);

    pcp = dpep->dpe_pcache;
    mutex_exit(&dpep->dpe_lock);

    res = pollstate_enter(pcp);
    if (res == PSE_SUCCESS) {
        pollstate_t *ps = curthread->t_pollstate;

        /*
         * Recursive polling will only emit certain events. Skip a
         * scan of the pollcache if those events are not of interest.
         */
        if (events & (POLLIN|POLLRDNORM)) {
            rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt);

        if (rc == 0 && fdcnt > 0) {
            *reventsp = POLLIN|POLLRDNORM;

        pcachelink_assoc(pcp, ps->ps_pc_stack[0]);
        pollstate_exit(pcp);

    switch (res) {
    case PSE_FAIL_DEPTH:

    case PSE_FAIL_DEADLOCK:

        /*
         * If anything else has gone awry, such as being polled
         * from an unexpected context, fall back to the
         * recursion-intolerant response.
         */
        *reventsp = POLLERR;
/*
 * devpoll close should do enough clean up before the pollcache is deleted,
 * i.e., it should ensure no one still references the pollcache later.
 * There is no "permission" check in here. Any process having the last
 * reference to this /dev/poll fd can close it.
 */
static int
dpclose(dev_t dev, int flag, int otyp, cred_t *credp)
{
    polldat_t **hashtbl;

    minor = getminor(dev);

    mutex_enter(&devpoll_lock);
    dpep = devpolltbl[minor];
    ASSERT(dpep != NULL);
    devpolltbl[minor] = NULL;
    mutex_exit(&devpoll_lock);
    pcp = dpep->dpe_pcache;
    ASSERT(pcp != NULL);

    /*
     * At this point, no other lwp can access this pollcache via the
     * /dev/poll fd. This pollcache is going away, so do the clean
     * up without the pc_lock.
     */
    hashtbl = pcp->pc_hash;
    for (i = 0; i < pcp->pc_hashsize; i++) {
        for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
            if (pdp->pd_php != NULL) {
                pollhead_delete(pdp->pd_php, pdp);

    /*
     * pollwakeup() may still interact with this pollcache. Wait until
     * it is done.
     */
    mutex_enter(&pcp->pc_no_exit);
    ASSERT(pcp->pc_busy >= 0);
    while (pcp->pc_busy > 0)
        cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
    mutex_exit(&pcp->pc_no_exit);

    /* Clean up any pollcache links created via recursive /dev/poll */
    if (pcp->pc_parents != NULL || pcp->pc_children != NULL) {
        /*
         * Because of the locking rules for pcachelink manipulation,
         * acquiring pc_lock is required for this step.
         */
        mutex_enter(&pcp->pc_lock);
        pcachelink_purge_all(pcp);
        mutex_exit(&pcp->pc_lock);
    }

    pcache_destroy(pcp);
    ASSERT(dpep->dpe_refcnt == 0);
    kmem_free(dpep, sizeof (dp_entry_t));
static void
pcachelink_locked_rele(pcachelink_t *pl)
{
    ASSERT(MUTEX_HELD(&pl->pcl_lock));
    VERIFY(pl->pcl_refcnt >= 1);

    pl->pcl_refcnt--;
    if (pl->pcl_refcnt == 0) {
        VERIFY(pl->pcl_state == PCL_INVALID);
        ASSERT(pl->pcl_parent_pc == NULL);
        ASSERT(pl->pcl_child_pc == NULL);
        ASSERT(pl->pcl_parent_next == NULL);
        ASSERT(pl->pcl_child_next == NULL);

        pl->pcl_state = PCL_FREE;
        mutex_destroy(&pl->pcl_lock);
        kmem_free(pl, sizeof (pcachelink_t));
    } else {
        mutex_exit(&pl->pcl_lock);
    }
}
/*
 * Associate parent and child pollcaches via a pcachelink_t. If an existing
 * link (stale or valid) between the two is found, it will be reused. If a
 * suitable link is not found for reuse, a new one will be allocated.
 */
static void
pcachelink_assoc(pollcache_t *child, pollcache_t *parent)
{
    pcachelink_t *pl, **plpn;

    ASSERT(MUTEX_HELD(&child->pc_lock));
    ASSERT(MUTEX_HELD(&parent->pc_lock));

    /* Search for an existing link we can reuse. */
    plpn = &child->pc_parents;
    for (pl = child->pc_parents; pl != NULL; pl = *plpn) {
        mutex_enter(&pl->pcl_lock);
        if (pl->pcl_state == PCL_INVALID) {
            /* Clean any invalid links while walking the list */
            *plpn = pl->pcl_parent_next;
            pl->pcl_child_pc = NULL;
            pl->pcl_parent_next = NULL;
            pcachelink_locked_rele(pl);
        } else if (pl->pcl_parent_pc == parent) {
            /* Successfully found parent link */
            ASSERT(pl->pcl_state == PCL_VALID ||
                pl->pcl_state == PCL_STALE);
            pl->pcl_state = PCL_VALID;
            mutex_exit(&pl->pcl_lock);
            return;
        }

        plpn = &pl->pcl_parent_next;
        mutex_exit(&pl->pcl_lock);
    }

    /* No existing link to the parent was found. Create a fresh one. */
    pl = kmem_zalloc(sizeof (pcachelink_t), KM_SLEEP);
    mutex_init(&pl->pcl_lock, NULL, MUTEX_DEFAULT, NULL);

    pl->pcl_parent_pc = parent;
    pl->pcl_child_next = parent->pc_children;
    parent->pc_children = pl;

    pl->pcl_child_pc = child;
    pl->pcl_parent_next = child->pc_parents;
    child->pc_parents = pl;

    pl->pcl_state = PCL_VALID;
/*
 * Mark all child links in a pollcache as stale. Any invalid child links found
 * during iteration are purged.
 */
static void
pcachelink_mark_stale(pollcache_t *pcp)
{
    pcachelink_t *pl, **plpn;

    ASSERT(MUTEX_HELD(&pcp->pc_lock));

    plpn = &pcp->pc_children;
    for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
        mutex_enter(&pl->pcl_lock);
        if (pl->pcl_state == PCL_INVALID) {
            /*
             * Remove any invalid links while we are going to the
             * trouble of walking the list.
             */
            *plpn = pl->pcl_child_next;
            pl->pcl_parent_pc = NULL;
            pl->pcl_child_next = NULL;
            pcachelink_locked_rele(pl);
        } else {
            pl->pcl_state = PCL_STALE;
            plpn = &pl->pcl_child_next;
            mutex_exit(&pl->pcl_lock);
        }
    }
}

/*
 * Purge all stale (or invalid) child links from a pollcache.
 */
static void
pcachelink_purge_stale(pollcache_t *pcp)
{
    pcachelink_t *pl, **plpn;

    ASSERT(MUTEX_HELD(&pcp->pc_lock));

    plpn = &pcp->pc_children;
    for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
        mutex_enter(&pl->pcl_lock);
        switch (pl->pcl_state) {

            pl->pcl_state = PCL_INVALID;

            *plpn = pl->pcl_child_next;
            pl->pcl_parent_pc = NULL;
            pl->pcl_child_next = NULL;
            pcachelink_locked_rele(pl);

            plpn = &pl->pcl_child_next;
            mutex_exit(&pl->pcl_lock);

/*
 * Purge all child and parent links from a pollcache, regardless of status.
 */
static void
pcachelink_purge_all(pollcache_t *pcp)
{
    pcachelink_t *pl, **plpn;

    ASSERT(MUTEX_HELD(&pcp->pc_lock));

    plpn = &pcp->pc_parents;
    for (pl = pcp->pc_parents; pl != NULL; pl = *plpn) {
        mutex_enter(&pl->pcl_lock);
        pl->pcl_state = PCL_INVALID;
        *plpn = pl->pcl_parent_next;
        pl->pcl_child_pc = NULL;
        pl->pcl_parent_next = NULL;
        pcachelink_locked_rele(pl);
    }

    plpn = &pcp->pc_children;
    for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
        mutex_enter(&pl->pcl_lock);
        pl->pcl_state = PCL_INVALID;
        *plpn = pl->pcl_child_next;
        pl->pcl_parent_pc = NULL;
        pl->pcl_child_next = NULL;
        pcachelink_locked_rele(pl);
    }

    ASSERT(pcp->pc_parents == NULL);
    ASSERT(pcp->pc_children == NULL);
}