15325 bhyve upstream sync 2023 January
[illumos-gate.git] / usr / src / cmd / bhyve / mevent.c
blobdcf82a5ce1278fb6829fae2ef88d290e0255210e
1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2011 NetApp, Inc.
5 * All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
28 * $FreeBSD$
32 * Copyright 2018 Joyent, Inc.
33 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
37 * Micro event library for FreeBSD, designed for a single i/o thread
38 * using kqueue, and having events be persistent by default.
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sysexits.h>
#include <unistd.h>
57 #include <sys/types.h>
58 #ifndef WITHOUT_CAPSICUM
59 #include <sys/capsicum.h>
60 #endif
61 #ifdef __FreeBSD__
62 #include <sys/event.h>
63 #else
64 #include <port.h>
65 #include <sys/poll.h>
66 #include <sys/siginfo.h>
67 #include <sys/queue.h>
68 #include <sys/debug.h>
69 #include <sys/stat.h>
70 #endif
71 #include <sys/time.h>
73 #include <pthread.h>
74 #include <pthread_np.h>
76 #include "mevent.h"
/*
 * Size of the kevent change/event arrays used by mevent_dispatch() and of
 * the scratch buffer used to drain the notification pipe.
 */
#define	MEVENT_MAX	64

#ifndef __FreeBSD__
/*
 * Local stand-ins for the kqueue flag names used throughout this file.
 * EV_ADD and EV_ENABLE share a value here; see mevent_clarify_state().
 */
#define	EV_ENABLE	0x01
#define	EV_ADD		EV_ENABLE
#define	EV_DISABLE	0x02
#define	EV_DELETE	0x04

/* Timer interval (ms) used when an EVF_VNODE event is converted to polling. */
static int mevent_file_poll_interval_ms = 5000;
#endif

/* Thread running mevent_dispatch(); recorded so mevent_notify() can skip it. */
static pthread_t mevent_tid;
static pthread_once_t mevent_once = PTHREAD_ONCE_INIT;
#ifdef __FreeBSD__
/* Next kevent ident handed out to an EVF_TIMER event. */
static int mevent_timid = 43;
#endif
/* Wake-up pipe: [0] is watched by the i/o thread, [1] is written by others. */
static int mevent_pipefd[2];
/* Descriptor returned by kqueue() or port_create(). */
static int mfd;
/* Guards global_head, change_head and the queue state of each mevent. */
static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
98 struct mevent {
99 void (*me_func)(int, enum ev_type, void *);
100 #define me_msecs me_fd
101 int me_fd;
102 #ifdef __FreeBSD__
103 int me_timid;
104 #else
105 timer_t me_timid;
106 #endif
107 enum ev_type me_type;
108 void *me_param;
109 int me_cq;
110 int me_state; /* Desired kevent flags. */
111 int me_closefd;
112 int me_fflags;
113 #ifndef __FreeBSD__
114 port_notify_t me_notify;
115 struct sigevent me_sigev;
116 boolean_t me_auto_requeue;
117 struct {
118 int mp_fd;
119 off_t mp_size;
120 void (*mp_func)(int, enum ev_type, void *);
121 void *mp_param;
122 } me_poll;
123 #endif
124 LIST_ENTRY(mevent) me_list;
127 static LIST_HEAD(listhead, mevent) global_head, change_head;
129 static void
130 mevent_qlock(void)
132 pthread_mutex_lock(&mevent_lmutex);
135 static void
136 mevent_qunlock(void)
138 pthread_mutex_unlock(&mevent_lmutex);
141 static void
142 mevent_pipe_read(int fd, enum ev_type type __unused, void *param __unused)
144 char buf[MEVENT_MAX];
145 int status;
148 * Drain the pipe read side. The fd is non-blocking so this is
149 * safe to do.
151 do {
152 status = read(fd, buf, sizeof(buf));
153 } while (status == MEVENT_MAX);
156 static void
157 mevent_notify(void)
159 char c = '\0';
162 * If calling from outside the i/o thread, write a byte on the
163 * pipe to force the i/o thread to exit the blocking kevent call.
165 if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
166 write(mevent_pipefd[1], &c, 1);
170 static void
171 mevent_init(void)
173 #ifndef WITHOUT_CAPSICUM
174 cap_rights_t rights;
175 #endif
177 #ifdef __FreeBSD__
178 mfd = kqueue();
179 #else
180 mfd = port_create();
181 #endif
182 assert(mfd > 0);
184 #ifndef WITHOUT_CAPSICUM
185 cap_rights_init(&rights, CAP_KQUEUE);
186 if (caph_rights_limit(mfd, &rights) == -1)
187 errx(EX_OSERR, "Unable to apply rights for sandbox");
188 #endif
190 LIST_INIT(&change_head);
191 LIST_INIT(&global_head);
195 #ifdef __FreeBSD__
196 static int
197 mevent_kq_filter(struct mevent *mevp)
199 int retval;
201 retval = 0;
203 if (mevp->me_type == EVF_READ)
204 retval = EVFILT_READ;
206 if (mevp->me_type == EVF_WRITE)
207 retval = EVFILT_WRITE;
209 if (mevp->me_type == EVF_TIMER)
210 retval = EVFILT_TIMER;
212 if (mevp->me_type == EVF_SIGNAL)
213 retval = EVFILT_SIGNAL;
215 if (mevp->me_type == EVF_VNODE)
216 retval = EVFILT_VNODE;
218 return (retval);
221 static int
222 mevent_kq_flags(struct mevent *mevp)
224 int retval;
226 retval = mevp->me_state;
228 if (mevp->me_type == EVF_VNODE)
229 retval |= EV_CLEAR;
231 return (retval);
234 static int
235 mevent_kq_fflags(struct mevent *mevp)
237 int retval;
239 retval = 0;
241 switch (mevp->me_type) {
242 case EVF_VNODE:
243 if ((mevp->me_fflags & EVFF_ATTRIB) != 0)
244 retval |= NOTE_ATTRIB;
245 break;
246 case EVF_READ:
247 case EVF_WRITE:
248 case EVF_TIMER:
249 case EVF_SIGNAL:
250 break;
253 return (retval);
256 static void
257 mevent_populate(struct mevent *mevp, struct kevent *kev)
259 if (mevp->me_type == EVF_TIMER) {
260 kev->ident = mevp->me_timid;
261 kev->data = mevp->me_msecs;
262 } else {
263 kev->ident = mevp->me_fd;
264 kev->data = 0;
266 kev->filter = mevent_kq_filter(mevp);
267 kev->flags = mevent_kq_flags(mevp);
268 kev->fflags = mevent_kq_fflags(mevp);
269 kev->udata = mevp;
272 static int
273 mevent_build(struct kevent *kev)
275 struct mevent *mevp, *tmpp;
276 int i;
278 i = 0;
280 mevent_qlock();
282 LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
283 if (mevp->me_closefd) {
285 * A close of the file descriptor will remove the
286 * event
288 close(mevp->me_fd);
289 } else {
290 assert((mevp->me_state & EV_ADD) == 0);
291 mevent_populate(mevp, &kev[i]);
292 i++;
295 mevp->me_cq = 0;
296 LIST_REMOVE(mevp, me_list);
298 if (mevp->me_state & EV_DELETE) {
299 free(mevp);
300 } else {
301 LIST_INSERT_HEAD(&global_head, mevp, me_list);
304 assert(i < MEVENT_MAX);
307 mevent_qunlock();
309 return (i);
312 static void
313 mevent_handle(struct kevent *kev, int numev)
315 struct mevent *mevp;
316 int i;
318 for (i = 0; i < numev; i++) {
319 mevp = kev[i].udata;
321 /* XXX check for EV_ERROR ? */
323 (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
327 #else /* __FreeBSD__ */
329 static boolean_t
330 mevent_clarify_state(struct mevent *mevp)
332 const int state = mevp->me_state;
334 if ((state & EV_DELETE) != 0) {
335 /* All other intents are overriden by delete. */
336 mevp->me_state = EV_DELETE;
337 return (B_TRUE);
341 * Without a distinction between EV_ADD and EV_ENABLE in our emulation,
342 * handling the add-disabled case means eliding the portfs operation
343 * when both flags are present.
345 * This is not a concern for subsequent enable/disable operations, as
346 * mevent_update() toggles the flags properly so they are not left in
347 * conflict.
349 if (state == (EV_ENABLE|EV_DISABLE)) {
350 mevp->me_state = EV_DISABLE;
351 return (B_FALSE);
354 return (B_TRUE);
357 static void
358 mevent_poll_file_attrib(int fd, enum ev_type type, void *param)
360 struct mevent *mevp = param;
361 struct stat st;
363 if (fstat(mevp->me_poll.mp_fd, &st) != 0) {
364 (void) fprintf(stderr, "%s: fstat(%d) failed: %s\n",
365 __func__, fd, strerror(errno));
366 return;
370 * The only current consumer of file attribute monitoring is
371 * blockif, which wants to know about size changes.
373 if (mevp->me_poll.mp_size != st.st_size) {
374 mevp->me_poll.mp_size = st.st_size;
376 (*mevp->me_poll.mp_func)(mevp->me_poll.mp_fd, EVF_VNODE,
377 mevp->me_poll.mp_param);
381 static void
382 mevent_update_one_readwrite(struct mevent *mevp)
384 int portfd = mevp->me_notify.portnfy_port;
386 mevp->me_auto_requeue = B_FALSE;
388 switch (mevp->me_state) {
389 case EV_ENABLE:
391 const int events = (mevp->me_type == EVF_READ) ?
392 POLLIN : POLLOUT;
394 if (port_associate(portfd, PORT_SOURCE_FD, mevp->me_fd,
395 events, mevp) != 0) {
396 (void) fprintf(stderr,
397 "port_associate fd %d %p failed: %s\n",
398 mevp->me_fd, mevp, strerror(errno));
400 return;
402 case EV_DISABLE:
403 case EV_DELETE:
405 * A disable that comes in while an event is being
406 * handled will result in an ENOENT.
408 if (port_dissociate(portfd, PORT_SOURCE_FD,
409 mevp->me_fd) != 0 && errno != ENOENT) {
410 (void) fprintf(stderr, "port_dissociate "
411 "portfd %d fd %d mevp %p failed: %s\n",
412 portfd, mevp->me_fd, mevp, strerror(errno));
414 return;
415 default:
416 (void) fprintf(stderr, "%s: unhandled state %d\n", __func__,
417 mevp->me_state);
418 abort();
422 static void
423 mevent_update_one_timer(struct mevent *mevp)
425 mevp->me_auto_requeue = B_TRUE;
427 switch (mevp->me_state) {
428 case EV_ENABLE:
430 struct itimerspec it = { 0 };
432 mevp->me_sigev.sigev_notify = SIGEV_PORT;
433 mevp->me_sigev.sigev_value.sival_ptr = &mevp->me_notify;
435 if (timer_create(CLOCK_REALTIME, &mevp->me_sigev,
436 &mevp->me_timid) != 0) {
437 (void) fprintf(stderr, "timer_create failed: %s",
438 strerror(errno));
439 return;
442 /* The first timeout */
443 it.it_value.tv_sec = mevp->me_msecs / MILLISEC;
444 it.it_value.tv_nsec =
445 MSEC2NSEC(mevp->me_msecs % MILLISEC);
446 /* Repeat at the same interval */
447 it.it_interval = it.it_value;
449 if (timer_settime(mevp->me_timid, 0, &it, NULL) != 0) {
450 (void) fprintf(stderr, "timer_settime failed: %s",
451 strerror(errno));
453 return;
455 case EV_DISABLE:
456 case EV_DELETE:
457 if (timer_delete(mevp->me_timid) != 0) {
458 (void) fprintf(stderr, "timer_delete failed: %s",
459 strerror(errno));
461 mevp->me_timid = -1;
462 return;
463 default:
464 (void) fprintf(stderr, "%s: unhandled state %d\n", __func__,
465 mevp->me_state);
466 abort();
470 static void
471 mevent_update_one_vnode(struct mevent *mevp)
473 switch (mevp->me_state) {
474 case EV_ENABLE:
476 struct stat st;
477 int events = 0;
479 if ((mevp->me_fflags & EVFF_ATTRIB) != 0)
480 events |= FILE_ATTRIB;
482 assert(events != 0);
485 * It is tempting to use the PORT_SOURCE_FILE type for this in
486 * conjunction with the FILE_ATTRIB event type. Unfortunately
487 * this event type triggers on any change to the file's
488 * ctime, and therefore for every write as well as attribute
489 * changes. It also does not work for ZVOLs.
491 * Convert this to a timer event and poll for the file
492 * attribute changes that we care about.
495 if (fstat(mevp->me_fd, &st) != 0) {
496 (void) fprintf(stderr, "fstat(%d) failed: %s\n",
497 mevp->me_fd, strerror(errno));
498 return;
501 mevp->me_poll.mp_fd = mevp->me_fd;
502 mevp->me_poll.mp_size = st.st_size;
504 mevp->me_poll.mp_func = mevp->me_func;
505 mevp->me_poll.mp_param = mevp->me_param;
506 mevp->me_func = mevent_poll_file_attrib;
507 mevp->me_param = mevp;
509 mevp->me_type = EVF_TIMER;
510 mevp->me_timid = -1;
511 mevp->me_msecs = mevent_file_poll_interval_ms;
512 mevent_update_one_timer(mevp);
514 return;
516 case EV_DISABLE:
517 case EV_DELETE:
519 * These events do not really exist as they are converted to
520 * timers; fall through to abort.
522 default:
523 (void) fprintf(stderr, "%s: unhandled state %d\n", __func__,
524 mevp->me_state);
525 abort();
529 static void
530 mevent_update_one(struct mevent *mevp)
532 switch (mevp->me_type) {
533 case EVF_READ:
534 case EVF_WRITE:
535 mevent_update_one_readwrite(mevp);
536 break;
537 case EVF_TIMER:
538 mevent_update_one_timer(mevp);
539 break;
540 case EVF_VNODE:
541 mevent_update_one_vnode(mevp);
542 break;
543 case EVF_SIGNAL: /* EVF_SIGNAL not yet implemented. */
544 default:
545 (void) fprintf(stderr, "%s: unhandled event type %d\n",
546 __func__, mevp->me_type);
547 abort();
551 static void
552 mevent_populate(struct mevent *mevp)
554 mevp->me_notify.portnfy_port = mfd;
555 mevp->me_notify.portnfy_user = mevp;
558 static void
559 mevent_update_pending()
561 struct mevent *mevp, *tmpp;
563 mevent_qlock();
565 LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
566 mevent_populate(mevp);
567 if (mevp->me_closefd) {
569 * A close of the file descriptor will remove the
570 * event
572 (void) close(mevp->me_fd);
573 mevp->me_fd = -1;
574 } else {
575 if (mevent_clarify_state(mevp)) {
576 mevent_update_one(mevp);
580 mevp->me_cq = 0;
581 LIST_REMOVE(mevp, me_list);
583 if (mevp->me_state & EV_DELETE) {
584 free(mevp);
585 } else {
586 LIST_INSERT_HEAD(&global_head, mevp, me_list);
590 mevent_qunlock();
593 static void
594 mevent_handle_pe(port_event_t *pe)
596 struct mevent *mevp = pe->portev_user;
598 (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
600 mevent_qlock();
601 if (!mevp->me_cq && !mevp->me_auto_requeue) {
602 mevent_update_one(mevp);
604 mevent_qunlock();
606 #endif
608 static struct mevent *
609 mevent_add_state(int tfd, enum ev_type type,
610 void (*func)(int, enum ev_type, void *), void *param,
611 int state, int fflags)
613 #ifdef __FreeBSD__
614 struct kevent kev;
615 #endif
616 struct mevent *lp, *mevp;
617 #ifdef __FreeBSD__
618 int ret;
619 #endif
621 if (tfd < 0 || func == NULL) {
622 return (NULL);
625 mevp = NULL;
627 pthread_once(&mevent_once, mevent_init);
629 mevent_qlock();
632 * Verify that the fd/type tuple is not present in any list
634 LIST_FOREACH(lp, &global_head, me_list) {
635 if (type != EVF_TIMER && lp->me_fd == tfd &&
636 lp->me_type == type) {
637 goto exit;
641 LIST_FOREACH(lp, &change_head, me_list) {
642 if (type != EVF_TIMER && lp->me_fd == tfd &&
643 lp->me_type == type) {
644 goto exit;
649 * Allocate an entry and populate it.
651 mevp = calloc(1, sizeof(struct mevent));
652 if (mevp == NULL) {
653 goto exit;
656 if (type == EVF_TIMER) {
657 mevp->me_msecs = tfd;
658 #ifdef __FreeBSD__
659 mevp->me_timid = mevent_timid++;
660 #else
661 mevp->me_timid = -1;
662 #endif
663 } else
664 mevp->me_fd = tfd;
665 mevp->me_type = type;
666 mevp->me_func = func;
667 mevp->me_param = param;
668 mevp->me_state = state;
669 mevp->me_fflags = fflags;
672 * Try to add the event. If this fails, report the failure to
673 * the caller.
675 #ifdef __FreeBSD__
676 mevent_populate(mevp, &kev);
677 ret = kevent(mfd, &kev, 1, NULL, 0, NULL);
678 if (ret == -1) {
679 free(mevp);
680 mevp = NULL;
681 goto exit;
683 mevp->me_state &= ~EV_ADD;
684 #else
685 mevent_populate(mevp);
686 if (mevent_clarify_state(mevp))
687 mevent_update_one(mevp);
688 #endif
690 LIST_INSERT_HEAD(&global_head, mevp, me_list);
692 exit:
693 mevent_qunlock();
695 return (mevp);
698 struct mevent *
699 mevent_add(int tfd, enum ev_type type,
700 void (*func)(int, enum ev_type, void *), void *param)
703 return (mevent_add_state(tfd, type, func, param, EV_ADD, 0));
706 struct mevent *
707 mevent_add_flags(int tfd, enum ev_type type, int fflags,
708 void (*func)(int, enum ev_type, void *), void *param)
711 return (mevent_add_state(tfd, type, func, param, EV_ADD, fflags));
714 struct mevent *
715 mevent_add_disabled(int tfd, enum ev_type type,
716 void (*func)(int, enum ev_type, void *), void *param)
719 return (mevent_add_state(tfd, type, func, param, EV_ADD | EV_DISABLE, 0));
722 static int
723 mevent_update(struct mevent *evp, bool enable)
725 int newstate;
727 mevent_qlock();
730 * It's not possible to enable/disable a deleted event
732 assert((evp->me_state & EV_DELETE) == 0);
734 newstate = evp->me_state;
735 if (enable) {
736 newstate |= EV_ENABLE;
737 newstate &= ~EV_DISABLE;
738 } else {
739 newstate |= EV_DISABLE;
740 newstate &= ~EV_ENABLE;
744 * No update needed if state isn't changing
746 if (evp->me_state != newstate) {
747 evp->me_state = newstate;
750 * Place the entry onto the changed list if not
751 * already there.
753 if (evp->me_cq == 0) {
754 evp->me_cq = 1;
755 LIST_REMOVE(evp, me_list);
756 LIST_INSERT_HEAD(&change_head, evp, me_list);
757 mevent_notify();
761 mevent_qunlock();
763 return (0);
/*
 * Request that an event be enabled.  Always returns 0.
 */
int
mevent_enable(struct mevent *evp)
{
	const bool enable = true;

	return (mevent_update(evp, enable));
}
/*
 * Request that an event be disabled.  Always returns 0.
 */
int
mevent_disable(struct mevent *evp)
{
	const bool enable = false;

	return (mevent_update(evp, enable));
}
780 static int
781 mevent_delete_event(struct mevent *evp, int closefd)
783 mevent_qlock();
786 * Place the entry onto the changed list if not already there, and
787 * mark as to be deleted.
789 if (evp->me_cq == 0) {
790 evp->me_cq = 1;
791 LIST_REMOVE(evp, me_list);
792 LIST_INSERT_HEAD(&change_head, evp, me_list);
793 mevent_notify();
795 evp->me_state = EV_DELETE;
797 if (closefd)
798 evp->me_closefd = 1;
800 mevent_qunlock();
802 return (0);
/*
 * Queue an event for deletion, leaving its descriptor open.  Always
 * returns 0.
 */
int
mevent_delete(struct mevent *evp)
{
	const int closefd = 0;

	return (mevent_delete_event(evp, closefd));
}
/*
 * Queue an event for deletion and have its descriptor closed when the
 * deletion is applied.  Always returns 0.
 */
int
mevent_delete_close(struct mevent *evp)
{
	const int closefd = 1;

	return (mevent_delete_event(evp, closefd));
}
819 static void
820 mevent_set_name(void)
823 pthread_set_name_np(mevent_tid, "mevent");
826 void
827 mevent_dispatch(void)
829 #ifdef __FreeBSD__
830 struct kevent changelist[MEVENT_MAX];
831 struct kevent eventlist[MEVENT_MAX];
832 struct mevent *pipev;
833 int numev;
834 #else
835 struct mevent *pipev;
836 #endif
837 int ret;
838 #ifndef WITHOUT_CAPSICUM
839 cap_rights_t rights;
840 #endif
842 mevent_tid = pthread_self();
843 mevent_set_name();
845 pthread_once(&mevent_once, mevent_init);
848 * Open the pipe that will be used for other threads to force
849 * the blocking kqueue call to exit by writing to it. Set the
850 * descriptor to non-blocking.
852 ret = pipe(mevent_pipefd);
853 if (ret < 0) {
854 perror("pipe");
855 exit(0);
858 #ifndef WITHOUT_CAPSICUM
859 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
860 if (caph_rights_limit(mevent_pipefd[0], &rights) == -1)
861 errx(EX_OSERR, "Unable to apply rights for sandbox");
862 if (caph_rights_limit(mevent_pipefd[1], &rights) == -1)
863 errx(EX_OSERR, "Unable to apply rights for sandbox");
864 #endif
867 * Add internal event handler for the pipe write fd
869 pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
870 assert(pipev != NULL);
872 for (;;) {
873 #ifdef __FreeBSD__
875 * Build changelist if required.
876 * XXX the changelist can be put into the blocking call
877 * to eliminate the extra syscall. Currently better for
878 * debug.
880 numev = mevent_build(changelist);
881 if (numev) {
882 ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
883 if (ret == -1) {
884 perror("Error return from kevent change");
889 * Block awaiting events
891 ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
892 if (ret == -1 && errno != EINTR) {
893 perror("Error return from kevent monitor");
897 * Handle reported events
899 mevent_handle(eventlist, ret);
901 #else /* __FreeBSD__ */
902 port_event_t pev;
904 /* Handle any pending updates */
905 mevent_update_pending();
907 /* Block awaiting events */
908 ret = port_get(mfd, &pev, NULL);
909 if (ret != 0) {
910 if (errno != EINTR)
911 perror("Error return from port_get");
912 continue;
915 /* Handle reported event */
916 mevent_handle_pe(&pev);
917 #endif /* __FreeBSD__ */