docs/devel/qapi-code-gen: Turn FIXME admonitions into comments
[qemu/ar7.git] / util / aio-posix.c
bloba8be940f760dc2e005bbded9d340f0014de074db
/*
 * QEMU aio implementation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */
16 #include "qemu/osdep.h"
17 #include "block/block.h"
18 #include "block/thread-pool.h"
19 #include "qemu/main-loop.h"
20 #include "qemu/rcu.h"
21 #include "qemu/rcu_queue.h"
22 #include "qemu/sockets.h"
23 #include "qemu/cutils.h"
24 #include "trace.h"
25 #include "aio-posix.h"
/*
 * Idle threshold for userspace polling, in nanoseconds.  A handler that has
 * not polled successfully for this long is dropped from the polled set.
 */
#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
30 bool aio_poll_disabled(AioContext *ctx)
32 return qatomic_read(&ctx->poll_disable_cnt);
35 void aio_add_ready_handler(AioHandlerList *ready_list,
36 AioHandler *node,
37 int revents)
39 QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
40 node->pfd.revents = revents;
41 QLIST_INSERT_HEAD(ready_list, node, node_ready);
44 static void aio_add_poll_ready_handler(AioHandlerList *ready_list,
45 AioHandler *node)
47 QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
48 node->poll_ready = true;
49 QLIST_INSERT_HEAD(ready_list, node, node_ready);
52 static AioHandler *find_aio_handler(AioContext *ctx, int fd)
54 AioHandler *node;
56 QLIST_FOREACH(node, &ctx->aio_handlers, node) {
57 if (node->pfd.fd == fd) {
58 if (!QLIST_IS_INSERTED(node, node_deleted)) {
59 return node;
64 return NULL;
67 static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
69 /* If the GSource is in the process of being destroyed then
70 * g_source_remove_poll() causes an assertion failure. Skip
71 * removal in that case, because glib cleans up its state during
72 * destruction anyway.
74 if (!g_source_is_destroyed(&ctx->source)) {
75 g_source_remove_poll(&ctx->source, &node->pfd);
78 node->pfd.revents = 0;
79 node->poll_ready = false;
81 /* If the fd monitor has already marked it deleted, leave it alone */
82 if (QLIST_IS_INSERTED(node, node_deleted)) {
83 return false;
86 /* If a read is in progress, just mark the node as deleted */
87 if (qemu_lockcnt_count(&ctx->list_lock)) {
88 QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
89 return false;
91 /* Otherwise, delete it for real. We can't just mark it as
92 * deleted because deleted nodes are only cleaned up while
93 * no one is walking the handlers list.
95 QLIST_SAFE_REMOVE(node, node_poll);
96 QLIST_REMOVE(node, node);
97 return true;
100 void aio_set_fd_handler(AioContext *ctx,
101 int fd,
102 bool is_external,
103 IOHandler *io_read,
104 IOHandler *io_write,
105 AioPollFn *io_poll,
106 IOHandler *io_poll_ready,
107 void *opaque)
109 AioHandler *node;
110 AioHandler *new_node = NULL;
111 bool is_new = false;
112 bool deleted = false;
113 int poll_disable_change;
115 if (io_poll && !io_poll_ready) {
116 io_poll = NULL; /* polling only makes sense if there is a handler */
119 qemu_lockcnt_lock(&ctx->list_lock);
121 node = find_aio_handler(ctx, fd);
123 /* Are we deleting the fd handler? */
124 if (!io_read && !io_write && !io_poll) {
125 if (node == NULL) {
126 qemu_lockcnt_unlock(&ctx->list_lock);
127 return;
129 /* Clean events in order to unregister fd from the ctx epoll. */
130 node->pfd.events = 0;
132 poll_disable_change = -!node->io_poll;
133 } else {
134 poll_disable_change = !io_poll - (node && !node->io_poll);
135 if (node == NULL) {
136 is_new = true;
138 /* Alloc and insert if it's not already there */
139 new_node = g_new0(AioHandler, 1);
141 /* Update handler with latest information */
142 new_node->io_read = io_read;
143 new_node->io_write = io_write;
144 new_node->io_poll = io_poll;
145 new_node->io_poll_ready = io_poll_ready;
146 new_node->opaque = opaque;
147 new_node->is_external = is_external;
149 if (is_new) {
150 new_node->pfd.fd = fd;
151 } else {
152 new_node->pfd = node->pfd;
154 g_source_add_poll(&ctx->source, &new_node->pfd);
156 new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
157 new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
159 QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
162 /* No need to order poll_disable_cnt writes against other updates;
163 * the counter is only used to avoid wasting time and latency on
164 * iterated polling when the system call will be ultimately necessary.
165 * Changing handlers is a rare event, and a little wasted polling until
166 * the aio_notify below is not an issue.
168 qatomic_set(&ctx->poll_disable_cnt,
169 qatomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
171 ctx->fdmon_ops->update(ctx, node, new_node);
172 if (node) {
173 deleted = aio_remove_fd_handler(ctx, node);
175 qemu_lockcnt_unlock(&ctx->list_lock);
176 aio_notify(ctx);
178 if (deleted) {
179 g_free(node);
183 static void aio_set_fd_poll(AioContext *ctx, int fd,
184 IOHandler *io_poll_begin,
185 IOHandler *io_poll_end)
187 AioHandler *node = find_aio_handler(ctx, fd);
189 if (!node) {
190 return;
193 node->io_poll_begin = io_poll_begin;
194 node->io_poll_end = io_poll_end;
197 void aio_set_event_notifier(AioContext *ctx,
198 EventNotifier *notifier,
199 bool is_external,
200 EventNotifierHandler *io_read,
201 AioPollFn *io_poll,
202 EventNotifierHandler *io_poll_ready)
204 aio_set_fd_handler(ctx, event_notifier_get_fd(notifier), is_external,
205 (IOHandler *)io_read, NULL, io_poll,
206 (IOHandler *)io_poll_ready, notifier);
209 void aio_set_event_notifier_poll(AioContext *ctx,
210 EventNotifier *notifier,
211 EventNotifierHandler *io_poll_begin,
212 EventNotifierHandler *io_poll_end)
214 aio_set_fd_poll(ctx, event_notifier_get_fd(notifier),
215 (IOHandler *)io_poll_begin,
216 (IOHandler *)io_poll_end);
219 static bool poll_set_started(AioContext *ctx, AioHandlerList *ready_list,
220 bool started)
222 AioHandler *node;
223 bool progress = false;
225 if (started == ctx->poll_started) {
226 return false;
229 ctx->poll_started = started;
231 qemu_lockcnt_inc(&ctx->list_lock);
232 QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
233 IOHandler *fn;
235 if (QLIST_IS_INSERTED(node, node_deleted)) {
236 continue;
239 if (started) {
240 fn = node->io_poll_begin;
241 } else {
242 fn = node->io_poll_end;
245 if (fn) {
246 fn(node->opaque);
249 /* Poll one last time in case ->io_poll_end() raced with the event */
250 if (!started && node->io_poll(node->opaque)) {
251 aio_add_poll_ready_handler(ready_list, node);
252 progress = true;
255 qemu_lockcnt_dec(&ctx->list_lock);
257 return progress;
261 bool aio_prepare(AioContext *ctx)
263 AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
265 /* Poll mode cannot be used with glib's event loop, disable it. */
266 poll_set_started(ctx, &ready_list, false);
267 /* TODO what to do with this list? */
269 return false;
272 bool aio_pending(AioContext *ctx)
274 AioHandler *node;
275 bool result = false;
278 * We have to walk very carefully in case aio_set_fd_handler is
279 * called while we're walking.
281 qemu_lockcnt_inc(&ctx->list_lock);
283 QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
284 int revents;
286 /* TODO should this check poll ready? */
287 revents = node->pfd.revents & node->pfd.events;
288 if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read &&
289 aio_node_check(ctx, node->is_external)) {
290 result = true;
291 break;
293 if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write &&
294 aio_node_check(ctx, node->is_external)) {
295 result = true;
296 break;
299 qemu_lockcnt_dec(&ctx->list_lock);
301 return result;
304 static void aio_free_deleted_handlers(AioContext *ctx)
306 AioHandler *node;
308 if (QLIST_EMPTY_RCU(&ctx->deleted_aio_handlers)) {
309 return;
311 if (!qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
312 return; /* we are nested, let the parent do the freeing */
315 while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
316 QLIST_REMOVE(node, node);
317 QLIST_REMOVE(node, node_deleted);
318 QLIST_SAFE_REMOVE(node, node_poll);
319 g_free(node);
322 qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
325 static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
327 bool progress = false;
328 bool poll_ready;
329 int revents;
331 revents = node->pfd.revents & node->pfd.events;
332 node->pfd.revents = 0;
334 poll_ready = node->poll_ready;
335 node->poll_ready = false;
338 * Start polling AioHandlers when they become ready because activity is
339 * likely to continue. Note that starvation is theoretically possible when
340 * fdmon_supports_polling(), but only until the fd fires for the first
341 * time.
343 if (!QLIST_IS_INSERTED(node, node_deleted) &&
344 !QLIST_IS_INSERTED(node, node_poll) &&
345 node->io_poll) {
346 trace_poll_add(ctx, node, node->pfd.fd, revents);
347 if (ctx->poll_started && node->io_poll_begin) {
348 node->io_poll_begin(node->opaque);
350 QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
352 if (!QLIST_IS_INSERTED(node, node_deleted) &&
353 poll_ready && revents == 0 &&
354 aio_node_check(ctx, node->is_external) &&
355 node->io_poll_ready) {
356 node->io_poll_ready(node->opaque);
359 * Return early since revents was zero. aio_notify() does not count as
360 * progress.
362 return node->opaque != &ctx->notifier;
365 if (!QLIST_IS_INSERTED(node, node_deleted) &&
366 (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
367 aio_node_check(ctx, node->is_external) &&
368 node->io_read) {
369 node->io_read(node->opaque);
371 /* aio_notify() does not count as progress */
372 if (node->opaque != &ctx->notifier) {
373 progress = true;
376 if (!QLIST_IS_INSERTED(node, node_deleted) &&
377 (revents & (G_IO_OUT | G_IO_ERR)) &&
378 aio_node_check(ctx, node->is_external) &&
379 node->io_write) {
380 node->io_write(node->opaque);
381 progress = true;
384 return progress;
388 * If we have a list of ready handlers then this is more efficient than
389 * scanning all handlers with aio_dispatch_handlers().
391 static bool aio_dispatch_ready_handlers(AioContext *ctx,
392 AioHandlerList *ready_list)
394 bool progress = false;
395 AioHandler *node;
397 while ((node = QLIST_FIRST(ready_list))) {
398 QLIST_REMOVE(node, node_ready);
399 progress = aio_dispatch_handler(ctx, node) || progress;
402 return progress;
405 /* Slower than aio_dispatch_ready_handlers() but only used via glib */
406 static bool aio_dispatch_handlers(AioContext *ctx)
408 AioHandler *node, *tmp;
409 bool progress = false;
411 QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
412 progress = aio_dispatch_handler(ctx, node) || progress;
415 return progress;
418 void aio_dispatch(AioContext *ctx)
420 qemu_lockcnt_inc(&ctx->list_lock);
421 aio_bh_poll(ctx);
422 aio_dispatch_handlers(ctx);
423 aio_free_deleted_handlers(ctx);
424 qemu_lockcnt_dec(&ctx->list_lock);
426 timerlistgroup_run_timers(&ctx->tlg);
429 static bool run_poll_handlers_once(AioContext *ctx,
430 AioHandlerList *ready_list,
431 int64_t now,
432 int64_t *timeout)
434 bool progress = false;
435 AioHandler *node;
436 AioHandler *tmp;
438 QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
439 if (aio_node_check(ctx, node->is_external) &&
440 node->io_poll(node->opaque)) {
441 aio_add_poll_ready_handler(ready_list, node);
443 node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
446 * Polling was successful, exit try_poll_mode immediately
447 * to adjust the next polling time.
449 *timeout = 0;
450 if (node->opaque != &ctx->notifier) {
451 progress = true;
455 /* Caller handles freeing deleted nodes. Don't do it here. */
458 return progress;
461 static bool fdmon_supports_polling(AioContext *ctx)
463 return ctx->fdmon_ops->need_wait != aio_poll_disabled;
466 static bool remove_idle_poll_handlers(AioContext *ctx,
467 AioHandlerList *ready_list,
468 int64_t now)
470 AioHandler *node;
471 AioHandler *tmp;
472 bool progress = false;
475 * File descriptor monitoring implementations without userspace polling
476 * support suffer from starvation when a subset of handlers is polled
477 * because fds will not be processed in a timely fashion. Don't remove
478 * idle poll handlers.
480 if (!fdmon_supports_polling(ctx)) {
481 return false;
484 QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
485 if (node->poll_idle_timeout == 0LL) {
486 node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
487 } else if (now >= node->poll_idle_timeout) {
488 trace_poll_remove(ctx, node, node->pfd.fd);
489 node->poll_idle_timeout = 0LL;
490 QLIST_SAFE_REMOVE(node, node_poll);
491 if (ctx->poll_started && node->io_poll_end) {
492 node->io_poll_end(node->opaque);
495 * Final poll in case ->io_poll_end() races with an event.
496 * Nevermind about re-adding the handler in the rare case where
497 * this causes progress.
499 if (node->io_poll(node->opaque)) {
500 aio_add_poll_ready_handler(ready_list, node);
501 progress = true;
507 return progress;
510 /* run_poll_handlers:
511 * @ctx: the AioContext
512 * @ready_list: the list to place ready handlers on
513 * @max_ns: maximum time to poll for, in nanoseconds
515 * Polls for a given time.
517 * Note that the caller must have incremented ctx->list_lock.
519 * Returns: true if progress was made, false otherwise
521 static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
522 int64_t max_ns, int64_t *timeout)
524 bool progress;
525 int64_t start_time, elapsed_time;
527 assert(qemu_lockcnt_count(&ctx->list_lock) > 0);
529 trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
532 * Optimization: ->io_poll() handlers often contain RCU read critical
533 * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
534 * -> rcu_read_lock() -> ... sequences with expensive memory
535 * synchronization primitives. Make the entire polling loop an RCU
536 * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
537 * are cheap.
539 RCU_READ_LOCK_GUARD();
541 start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
542 do {
543 progress = run_poll_handlers_once(ctx, ready_list,
544 start_time, timeout);
545 elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
546 max_ns = qemu_soonest_timeout(*timeout, max_ns);
547 assert(!(max_ns && progress));
548 } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
550 if (remove_idle_poll_handlers(ctx, ready_list,
551 start_time + elapsed_time)) {
552 *timeout = 0;
553 progress = true;
556 /* If time has passed with no successful polling, adjust *timeout to
557 * keep the same ending time.
559 if (*timeout != -1) {
560 *timeout -= MIN(*timeout, elapsed_time);
563 trace_run_poll_handlers_end(ctx, progress, *timeout);
564 return progress;
567 /* try_poll_mode:
568 * @ctx: the AioContext
569 * @ready_list: list to add handlers that need to be run
570 * @timeout: timeout for blocking wait, computed by the caller and updated if
571 * polling succeeds.
573 * Note that the caller must have incremented ctx->list_lock.
575 * Returns: true if progress was made, false otherwise
577 static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
578 int64_t *timeout)
580 int64_t max_ns;
582 if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
583 return false;
586 max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
587 if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
589 * Enable poll mode. It pairs with the poll_set_started() in
590 * aio_poll() which disables poll mode.
592 poll_set_started(ctx, ready_list, true);
594 if (run_poll_handlers(ctx, ready_list, max_ns, timeout)) {
595 return true;
598 return false;
601 bool aio_poll(AioContext *ctx, bool blocking)
603 AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
604 bool progress;
605 bool use_notify_me;
606 int64_t timeout;
607 int64_t start = 0;
610 * There cannot be two concurrent aio_poll calls for the same AioContext (or
611 * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
612 * We rely on this below to avoid slow locked accesses to ctx->notify_me.
614 * aio_poll() may only be called in the AioContext's thread. iohandler_ctx
615 * is special in that it runs in the main thread, but that thread's context
616 * is qemu_aio_context.
618 assert(in_aio_context_home_thread(ctx == iohandler_get_aio_context() ?
619 qemu_get_aio_context() : ctx));
621 qemu_lockcnt_inc(&ctx->list_lock);
623 if (ctx->poll_max_ns) {
624 start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
627 timeout = blocking ? aio_compute_timeout(ctx) : 0;
628 progress = try_poll_mode(ctx, &ready_list, &timeout);
629 assert(!(timeout && progress));
632 * aio_notify can avoid the expensive event_notifier_set if
633 * everything (file descriptors, bottom halves, timers) will
634 * be re-evaluated before the next blocking poll(). This is
635 * already true when aio_poll is called with blocking == false;
636 * if blocking == true, it is only true after poll() returns,
637 * so disable the optimization now.
639 use_notify_me = timeout != 0;
640 if (use_notify_me) {
641 qatomic_set(&ctx->notify_me, qatomic_read(&ctx->notify_me) + 2);
643 * Write ctx->notify_me before reading ctx->notified. Pairs with
644 * smp_mb in aio_notify().
646 smp_mb();
648 /* Don't block if aio_notify() was called */
649 if (qatomic_read(&ctx->notified)) {
650 timeout = 0;
654 /* If polling is allowed, non-blocking aio_poll does not need the
655 * system call---a single round of run_poll_handlers_once suffices.
657 if (timeout || ctx->fdmon_ops->need_wait(ctx)) {
659 * Disable poll mode. poll mode should be disabled before the call
660 * of ctx->fdmon_ops->wait() so that guest's notification can wake
661 * up IO threads when some work becomes pending. It is essential to
662 * avoid hangs or unnecessary latency.
664 if (poll_set_started(ctx, &ready_list, false)) {
665 timeout = 0;
666 progress = true;
669 ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
672 if (use_notify_me) {
673 /* Finish the poll before clearing the flag. */
674 qatomic_store_release(&ctx->notify_me,
675 qatomic_read(&ctx->notify_me) - 2);
678 aio_notify_accept(ctx);
680 /* Adjust polling time */
681 if (ctx->poll_max_ns) {
682 int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
684 if (block_ns <= ctx->poll_ns) {
685 /* This is the sweet spot, no adjustment needed */
686 } else if (block_ns > ctx->poll_max_ns) {
687 /* We'd have to poll for too long, poll less */
688 int64_t old = ctx->poll_ns;
690 if (ctx->poll_shrink) {
691 ctx->poll_ns /= ctx->poll_shrink;
692 } else {
693 ctx->poll_ns = 0;
696 trace_poll_shrink(ctx, old, ctx->poll_ns);
697 } else if (ctx->poll_ns < ctx->poll_max_ns &&
698 block_ns < ctx->poll_max_ns) {
699 /* There is room to grow, poll longer */
700 int64_t old = ctx->poll_ns;
701 int64_t grow = ctx->poll_grow;
703 if (grow == 0) {
704 grow = 2;
707 if (ctx->poll_ns) {
708 ctx->poll_ns *= grow;
709 } else {
710 ctx->poll_ns = 4000; /* start polling at 4 microseconds */
713 if (ctx->poll_ns > ctx->poll_max_ns) {
714 ctx->poll_ns = ctx->poll_max_ns;
717 trace_poll_grow(ctx, old, ctx->poll_ns);
721 progress |= aio_bh_poll(ctx);
722 progress |= aio_dispatch_ready_handlers(ctx, &ready_list);
724 aio_free_deleted_handlers(ctx);
726 qemu_lockcnt_dec(&ctx->list_lock);
728 progress |= timerlistgroup_run_timers(&ctx->tlg);
730 return progress;
733 void aio_context_setup(AioContext *ctx)
735 ctx->fdmon_ops = &fdmon_poll_ops;
736 ctx->epollfd = -1;
738 /* Use the fastest fd monitoring implementation if available */
739 if (fdmon_io_uring_setup(ctx)) {
740 return;
743 fdmon_epoll_setup(ctx);
746 void aio_context_destroy(AioContext *ctx)
748 fdmon_io_uring_destroy(ctx);
749 fdmon_epoll_disable(ctx);
750 aio_free_deleted_handlers(ctx);
753 void aio_context_use_g_source(AioContext *ctx)
756 * Disable io_uring when the glib main loop is used because it doesn't
757 * support mixed glib/aio_poll() usage. It relies on aio_poll() being
758 * called regularly so that changes to the monitored file descriptors are
759 * submitted, otherwise a list of pending fd handlers builds up.
761 fdmon_io_uring_destroy(ctx);
762 aio_free_deleted_handlers(ctx);
765 void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
766 int64_t grow, int64_t shrink, Error **errp)
768 /* No thread synchronization here, it doesn't matter if an incorrect value
769 * is used once.
771 ctx->poll_max_ns = max_ns;
772 ctx->poll_ns = 0;
773 ctx->poll_grow = grow;
774 ctx->poll_shrink = shrink;
776 aio_notify(ctx);
779 void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
780 Error **errp)
783 * No thread synchronization here, it doesn't matter if an incorrect value
784 * is used once.
786 ctx->aio_max_batch = max_batch;
788 aio_notify(ctx);