vmdk: switch to *_co_* functions
[qemu/kevin.git] / util / aio-posix.c
blob731f3826c062df3df4caac5f45d26d203489ee04
1 /*
2 * QEMU aio implementation
4 * Copyright IBM, Corp. 2008
6 * Authors:
7 * Anthony Liguori <aliguori@us.ibm.com>
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
12 * Contributions after 2012-01-13 are licensed under the terms of the
13 * GNU GPL, version 2 or (at your option) any later version.
16 #include "qemu/osdep.h"
17 #include "block/block.h"
18 #include "block/thread-pool.h"
19 #include "qemu/main-loop.h"
20 #include "qemu/rcu.h"
21 #include "qemu/rcu_queue.h"
22 #include "qemu/sockets.h"
23 #include "qemu/cutils.h"
24 #include "trace.h"
25 #include "aio-posix.h"
/*
 * A handler that has not polled successfully for this long is taken off
 * the userspace polling list again.
 */
#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
30 bool aio_poll_disabled(AioContext *ctx)
32 return qatomic_read(&ctx->poll_disable_cnt);
35 void aio_add_ready_handler(AioHandlerList *ready_list,
36 AioHandler *node,
37 int revents)
39 QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
40 node->pfd.revents = revents;
41 QLIST_INSERT_HEAD(ready_list, node, node_ready);
44 static void aio_add_poll_ready_handler(AioHandlerList *ready_list,
45 AioHandler *node)
47 QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
48 node->poll_ready = true;
49 QLIST_INSERT_HEAD(ready_list, node, node_ready);
52 static AioHandler *find_aio_handler(AioContext *ctx, int fd)
54 AioHandler *node;
56 QLIST_FOREACH(node, &ctx->aio_handlers, node) {
57 if (node->pfd.fd == fd) {
58 if (!QLIST_IS_INSERTED(node, node_deleted)) {
59 return node;
64 return NULL;
67 static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
69 /* If the GSource is in the process of being destroyed then
70 * g_source_remove_poll() causes an assertion failure. Skip
71 * removal in that case, because glib cleans up its state during
72 * destruction anyway.
74 if (!g_source_is_destroyed(&ctx->source)) {
75 g_source_remove_poll(&ctx->source, &node->pfd);
78 node->pfd.revents = 0;
79 node->poll_ready = false;
81 /* If the fd monitor has already marked it deleted, leave it alone */
82 if (QLIST_IS_INSERTED(node, node_deleted)) {
83 return false;
86 /* If a read is in progress, just mark the node as deleted */
87 if (qemu_lockcnt_count(&ctx->list_lock)) {
88 QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
89 return false;
91 /* Otherwise, delete it for real. We can't just mark it as
92 * deleted because deleted nodes are only cleaned up while
93 * no one is walking the handlers list.
95 QLIST_SAFE_REMOVE(node, node_poll);
96 QLIST_REMOVE(node, node);
97 return true;
100 void aio_set_fd_handler(AioContext *ctx,
101 int fd,
102 bool is_external,
103 IOHandler *io_read,
104 IOHandler *io_write,
105 AioPollFn *io_poll,
106 IOHandler *io_poll_ready,
107 void *opaque)
109 AioHandler *node;
110 AioHandler *new_node = NULL;
111 bool is_new = false;
112 bool deleted = false;
113 int poll_disable_change;
115 if (io_poll && !io_poll_ready) {
116 io_poll = NULL; /* polling only makes sense if there is a handler */
119 qemu_lockcnt_lock(&ctx->list_lock);
121 node = find_aio_handler(ctx, fd);
123 /* Are we deleting the fd handler? */
124 if (!io_read && !io_write && !io_poll) {
125 if (node == NULL) {
126 qemu_lockcnt_unlock(&ctx->list_lock);
127 return;
129 /* Clean events in order to unregister fd from the ctx epoll. */
130 node->pfd.events = 0;
132 poll_disable_change = -!node->io_poll;
133 } else {
134 poll_disable_change = !io_poll - (node && !node->io_poll);
135 if (node == NULL) {
136 is_new = true;
138 /* Alloc and insert if it's not already there */
139 new_node = g_new0(AioHandler, 1);
141 /* Update handler with latest information */
142 new_node->io_read = io_read;
143 new_node->io_write = io_write;
144 new_node->io_poll = io_poll;
145 new_node->io_poll_ready = io_poll_ready;
146 new_node->opaque = opaque;
147 new_node->is_external = is_external;
149 if (is_new) {
150 new_node->pfd.fd = fd;
151 } else {
152 new_node->pfd = node->pfd;
154 g_source_add_poll(&ctx->source, &new_node->pfd);
156 new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
157 new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
159 QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
162 /* No need to order poll_disable_cnt writes against other updates;
163 * the counter is only used to avoid wasting time and latency on
164 * iterated polling when the system call will be ultimately necessary.
165 * Changing handlers is a rare event, and a little wasted polling until
166 * the aio_notify below is not an issue.
168 qatomic_set(&ctx->poll_disable_cnt,
169 qatomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
171 ctx->fdmon_ops->update(ctx, node, new_node);
172 if (node) {
173 deleted = aio_remove_fd_handler(ctx, node);
175 qemu_lockcnt_unlock(&ctx->list_lock);
176 aio_notify(ctx);
178 if (deleted) {
179 g_free(node);
183 void aio_set_fd_poll(AioContext *ctx, int fd,
184 IOHandler *io_poll_begin,
185 IOHandler *io_poll_end)
187 AioHandler *node = find_aio_handler(ctx, fd);
189 if (!node) {
190 return;
193 node->io_poll_begin = io_poll_begin;
194 node->io_poll_end = io_poll_end;
197 void aio_set_event_notifier(AioContext *ctx,
198 EventNotifier *notifier,
199 bool is_external,
200 EventNotifierHandler *io_read,
201 AioPollFn *io_poll,
202 EventNotifierHandler *io_poll_ready)
204 aio_set_fd_handler(ctx, event_notifier_get_fd(notifier), is_external,
205 (IOHandler *)io_read, NULL, io_poll,
206 (IOHandler *)io_poll_ready, notifier);
209 void aio_set_event_notifier_poll(AioContext *ctx,
210 EventNotifier *notifier,
211 EventNotifierHandler *io_poll_begin,
212 EventNotifierHandler *io_poll_end)
214 aio_set_fd_poll(ctx, event_notifier_get_fd(notifier),
215 (IOHandler *)io_poll_begin,
216 (IOHandler *)io_poll_end);
219 static bool poll_set_started(AioContext *ctx, AioHandlerList *ready_list,
220 bool started)
222 AioHandler *node;
223 bool progress = false;
225 if (started == ctx->poll_started) {
226 return false;
229 ctx->poll_started = started;
231 qemu_lockcnt_inc(&ctx->list_lock);
232 QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
233 IOHandler *fn;
235 if (QLIST_IS_INSERTED(node, node_deleted)) {
236 continue;
239 if (started) {
240 fn = node->io_poll_begin;
241 } else {
242 fn = node->io_poll_end;
245 if (fn) {
246 fn(node->opaque);
249 /* Poll one last time in case ->io_poll_end() raced with the event */
250 if (!started && node->io_poll(node->opaque)) {
251 aio_add_poll_ready_handler(ready_list, node);
252 progress = true;
255 qemu_lockcnt_dec(&ctx->list_lock);
257 return progress;
261 bool aio_prepare(AioContext *ctx)
263 AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
265 /* Poll mode cannot be used with glib's event loop, disable it. */
266 poll_set_started(ctx, &ready_list, false);
267 /* TODO what to do with this list? */
269 return false;
272 bool aio_pending(AioContext *ctx)
274 AioHandler *node;
275 bool result = false;
278 * We have to walk very carefully in case aio_set_fd_handler is
279 * called while we're walking.
281 qemu_lockcnt_inc(&ctx->list_lock);
283 QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
284 int revents;
286 /* TODO should this check poll ready? */
287 revents = node->pfd.revents & node->pfd.events;
288 if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read &&
289 aio_node_check(ctx, node->is_external)) {
290 result = true;
291 break;
293 if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write &&
294 aio_node_check(ctx, node->is_external)) {
295 result = true;
296 break;
299 qemu_lockcnt_dec(&ctx->list_lock);
301 return result;
304 static void aio_free_deleted_handlers(AioContext *ctx)
306 AioHandler *node;
308 if (QLIST_EMPTY_RCU(&ctx->deleted_aio_handlers)) {
309 return;
311 if (!qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
312 return; /* we are nested, let the parent do the freeing */
315 while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
316 QLIST_REMOVE(node, node);
317 QLIST_REMOVE(node, node_deleted);
318 QLIST_SAFE_REMOVE(node, node_poll);
319 g_free(node);
322 qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
325 static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
327 bool progress = false;
328 bool poll_ready;
329 int revents;
331 revents = node->pfd.revents & node->pfd.events;
332 node->pfd.revents = 0;
334 poll_ready = node->poll_ready;
335 node->poll_ready = false;
338 * Start polling AioHandlers when they become ready because activity is
339 * likely to continue. Note that starvation is theoretically possible when
340 * fdmon_supports_polling(), but only until the fd fires for the first
341 * time.
343 if (!QLIST_IS_INSERTED(node, node_deleted) &&
344 !QLIST_IS_INSERTED(node, node_poll) &&
345 node->io_poll) {
346 trace_poll_add(ctx, node, node->pfd.fd, revents);
347 if (ctx->poll_started && node->io_poll_begin) {
348 node->io_poll_begin(node->opaque);
350 QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
352 if (!QLIST_IS_INSERTED(node, node_deleted) &&
353 poll_ready && revents == 0 &&
354 aio_node_check(ctx, node->is_external) &&
355 node->io_poll_ready) {
356 node->io_poll_ready(node->opaque);
359 * Return early since revents was zero. aio_notify() does not count as
360 * progress.
362 return node->opaque != &ctx->notifier;
365 if (!QLIST_IS_INSERTED(node, node_deleted) &&
366 (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
367 aio_node_check(ctx, node->is_external) &&
368 node->io_read) {
369 node->io_read(node->opaque);
371 /* aio_notify() does not count as progress */
372 if (node->opaque != &ctx->notifier) {
373 progress = true;
376 if (!QLIST_IS_INSERTED(node, node_deleted) &&
377 (revents & (G_IO_OUT | G_IO_ERR)) &&
378 aio_node_check(ctx, node->is_external) &&
379 node->io_write) {
380 node->io_write(node->opaque);
381 progress = true;
384 return progress;
388 * If we have a list of ready handlers then this is more efficient than
389 * scanning all handlers with aio_dispatch_handlers().
391 static bool aio_dispatch_ready_handlers(AioContext *ctx,
392 AioHandlerList *ready_list)
394 bool progress = false;
395 AioHandler *node;
397 while ((node = QLIST_FIRST(ready_list))) {
398 QLIST_REMOVE(node, node_ready);
399 progress = aio_dispatch_handler(ctx, node) || progress;
402 return progress;
405 /* Slower than aio_dispatch_ready_handlers() but only used via glib */
406 static bool aio_dispatch_handlers(AioContext *ctx)
408 AioHandler *node, *tmp;
409 bool progress = false;
411 QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
412 progress = aio_dispatch_handler(ctx, node) || progress;
415 return progress;
418 void aio_dispatch(AioContext *ctx)
420 qemu_lockcnt_inc(&ctx->list_lock);
421 aio_bh_poll(ctx);
422 aio_dispatch_handlers(ctx);
423 aio_free_deleted_handlers(ctx);
424 qemu_lockcnt_dec(&ctx->list_lock);
426 timerlistgroup_run_timers(&ctx->tlg);
429 static bool run_poll_handlers_once(AioContext *ctx,
430 AioHandlerList *ready_list,
431 int64_t now,
432 int64_t *timeout)
434 bool progress = false;
435 AioHandler *node;
436 AioHandler *tmp;
438 QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
439 if (aio_node_check(ctx, node->is_external) &&
440 node->io_poll(node->opaque)) {
441 aio_add_poll_ready_handler(ready_list, node);
443 node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
446 * Polling was successful, exit try_poll_mode immediately
447 * to adjust the next polling time.
449 *timeout = 0;
450 if (node->opaque != &ctx->notifier) {
451 progress = true;
455 /* Caller handles freeing deleted nodes. Don't do it here. */
458 return progress;
461 static bool fdmon_supports_polling(AioContext *ctx)
463 return ctx->fdmon_ops->need_wait != aio_poll_disabled;
466 static bool remove_idle_poll_handlers(AioContext *ctx,
467 AioHandlerList *ready_list,
468 int64_t now)
470 AioHandler *node;
471 AioHandler *tmp;
472 bool progress = false;
475 * File descriptor monitoring implementations without userspace polling
476 * support suffer from starvation when a subset of handlers is polled
477 * because fds will not be processed in a timely fashion. Don't remove
478 * idle poll handlers.
480 if (!fdmon_supports_polling(ctx)) {
481 return false;
484 QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
485 if (node->poll_idle_timeout == 0LL) {
486 node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
487 } else if (now >= node->poll_idle_timeout) {
488 trace_poll_remove(ctx, node, node->pfd.fd);
489 node->poll_idle_timeout = 0LL;
490 QLIST_SAFE_REMOVE(node, node_poll);
491 if (ctx->poll_started && node->io_poll_end) {
492 node->io_poll_end(node->opaque);
495 * Final poll in case ->io_poll_end() races with an event.
496 * Nevermind about re-adding the handler in the rare case where
497 * this causes progress.
499 if (node->io_poll(node->opaque)) {
500 aio_add_poll_ready_handler(ready_list, node);
501 progress = true;
507 return progress;
510 /* run_poll_handlers:
511 * @ctx: the AioContext
512 * @ready_list: the list to place ready handlers on
513 * @max_ns: maximum time to poll for, in nanoseconds
515 * Polls for a given time.
517 * Note that the caller must have incremented ctx->list_lock.
519 * Returns: true if progress was made, false otherwise
521 static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
522 int64_t max_ns, int64_t *timeout)
524 bool progress;
525 int64_t start_time, elapsed_time;
527 assert(qemu_lockcnt_count(&ctx->list_lock) > 0);
529 trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
532 * Optimization: ->io_poll() handlers often contain RCU read critical
533 * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
534 * -> rcu_read_lock() -> ... sequences with expensive memory
535 * synchronization primitives. Make the entire polling loop an RCU
536 * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
537 * are cheap.
539 RCU_READ_LOCK_GUARD();
541 start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
542 do {
543 progress = run_poll_handlers_once(ctx, ready_list,
544 start_time, timeout);
545 elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
546 max_ns = qemu_soonest_timeout(*timeout, max_ns);
547 assert(!(max_ns && progress));
548 } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
550 if (remove_idle_poll_handlers(ctx, ready_list,
551 start_time + elapsed_time)) {
552 *timeout = 0;
553 progress = true;
556 /* If time has passed with no successful polling, adjust *timeout to
557 * keep the same ending time.
559 if (*timeout != -1) {
560 *timeout -= MIN(*timeout, elapsed_time);
563 trace_run_poll_handlers_end(ctx, progress, *timeout);
564 return progress;
567 /* try_poll_mode:
568 * @ctx: the AioContext
569 * @ready_list: list to add handlers that need to be run
570 * @timeout: timeout for blocking wait, computed by the caller and updated if
571 * polling succeeds.
573 * Note that the caller must have incremented ctx->list_lock.
575 * Returns: true if progress was made, false otherwise
577 static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
578 int64_t *timeout)
580 int64_t max_ns;
582 if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
583 return false;
586 max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
587 if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
588 poll_set_started(ctx, ready_list, true);
590 if (run_poll_handlers(ctx, ready_list, max_ns, timeout)) {
591 return true;
595 if (poll_set_started(ctx, ready_list, false)) {
596 *timeout = 0;
597 return true;
600 return false;
603 bool aio_poll(AioContext *ctx, bool blocking)
605 AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
606 bool progress;
607 bool use_notify_me;
608 int64_t timeout;
609 int64_t start = 0;
612 * There cannot be two concurrent aio_poll calls for the same AioContext (or
613 * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
614 * We rely on this below to avoid slow locked accesses to ctx->notify_me.
616 * aio_poll() may only be called in the AioContext's thread. iohandler_ctx
617 * is special in that it runs in the main thread, but that thread's context
618 * is qemu_aio_context.
620 assert(in_aio_context_home_thread(ctx == iohandler_get_aio_context() ?
621 qemu_get_aio_context() : ctx));
623 qemu_lockcnt_inc(&ctx->list_lock);
625 if (ctx->poll_max_ns) {
626 start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
629 timeout = blocking ? aio_compute_timeout(ctx) : 0;
630 progress = try_poll_mode(ctx, &ready_list, &timeout);
631 assert(!(timeout && progress));
634 * aio_notify can avoid the expensive event_notifier_set if
635 * everything (file descriptors, bottom halves, timers) will
636 * be re-evaluated before the next blocking poll(). This is
637 * already true when aio_poll is called with blocking == false;
638 * if blocking == true, it is only true after poll() returns,
639 * so disable the optimization now.
641 use_notify_me = timeout != 0;
642 if (use_notify_me) {
643 qatomic_set(&ctx->notify_me, qatomic_read(&ctx->notify_me) + 2);
645 * Write ctx->notify_me before reading ctx->notified. Pairs with
646 * smp_mb in aio_notify().
648 smp_mb();
650 /* Don't block if aio_notify() was called */
651 if (qatomic_read(&ctx->notified)) {
652 timeout = 0;
656 /* If polling is allowed, non-blocking aio_poll does not need the
657 * system call---a single round of run_poll_handlers_once suffices.
659 if (timeout || ctx->fdmon_ops->need_wait(ctx)) {
660 ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
663 if (use_notify_me) {
664 /* Finish the poll before clearing the flag. */
665 qatomic_store_release(&ctx->notify_me,
666 qatomic_read(&ctx->notify_me) - 2);
669 aio_notify_accept(ctx);
671 /* Adjust polling time */
672 if (ctx->poll_max_ns) {
673 int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
675 if (block_ns <= ctx->poll_ns) {
676 /* This is the sweet spot, no adjustment needed */
677 } else if (block_ns > ctx->poll_max_ns) {
678 /* We'd have to poll for too long, poll less */
679 int64_t old = ctx->poll_ns;
681 if (ctx->poll_shrink) {
682 ctx->poll_ns /= ctx->poll_shrink;
683 } else {
684 ctx->poll_ns = 0;
687 trace_poll_shrink(ctx, old, ctx->poll_ns);
688 } else if (ctx->poll_ns < ctx->poll_max_ns &&
689 block_ns < ctx->poll_max_ns) {
690 /* There is room to grow, poll longer */
691 int64_t old = ctx->poll_ns;
692 int64_t grow = ctx->poll_grow;
694 if (grow == 0) {
695 grow = 2;
698 if (ctx->poll_ns) {
699 ctx->poll_ns *= grow;
700 } else {
701 ctx->poll_ns = 4000; /* start polling at 4 microseconds */
704 if (ctx->poll_ns > ctx->poll_max_ns) {
705 ctx->poll_ns = ctx->poll_max_ns;
708 trace_poll_grow(ctx, old, ctx->poll_ns);
712 progress |= aio_bh_poll(ctx);
713 progress |= aio_dispatch_ready_handlers(ctx, &ready_list);
715 aio_free_deleted_handlers(ctx);
717 qemu_lockcnt_dec(&ctx->list_lock);
719 progress |= timerlistgroup_run_timers(&ctx->tlg);
721 return progress;
724 void aio_context_setup(AioContext *ctx)
726 ctx->fdmon_ops = &fdmon_poll_ops;
727 ctx->epollfd = -1;
729 /* Use the fastest fd monitoring implementation if available */
730 if (fdmon_io_uring_setup(ctx)) {
731 return;
734 fdmon_epoll_setup(ctx);
737 void aio_context_destroy(AioContext *ctx)
739 fdmon_io_uring_destroy(ctx);
740 fdmon_epoll_disable(ctx);
741 aio_free_deleted_handlers(ctx);
744 void aio_context_use_g_source(AioContext *ctx)
747 * Disable io_uring when the glib main loop is used because it doesn't
748 * support mixed glib/aio_poll() usage. It relies on aio_poll() being
749 * called regularly so that changes to the monitored file descriptors are
750 * submitted, otherwise a list of pending fd handlers builds up.
752 fdmon_io_uring_destroy(ctx);
753 aio_free_deleted_handlers(ctx);
756 void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
757 int64_t grow, int64_t shrink, Error **errp)
759 /* No thread synchronization here, it doesn't matter if an incorrect value
760 * is used once.
762 ctx->poll_max_ns = max_ns;
763 ctx->poll_ns = 0;
764 ctx->poll_grow = grow;
765 ctx->poll_shrink = shrink;
767 aio_notify(ctx);
770 void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
771 Error **errp)
774 * No thread synchronization here, it doesn't matter if an incorrect value
775 * is used once.
777 ctx->aio_max_batch = max_batch;
779 aio_notify(ctx);