/*
 * QEMU aio implementation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu-common.h"
#include "block/block.h"
#include "qemu/queue.h"
#include "qemu/sockets.h"

struct AioHandler {
    GPollFD pfd;
    IOHandler *io_read;
    IOHandler *io_write;
    int deleted;
    void *opaque;
    QLIST_ENTRY(AioHandler) node;
};

static AioHandler *find_aio_handler(AioContext *ctx, int fd)
{
    AioHandler *node;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        if (node->pfd.fd == fd) {
            if (!node->deleted) {
                return node;
            }
        }
    }

    return NULL;
}

void aio_set_fd_handler(AioContext *ctx,
                        int fd,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        void *opaque)
{
    AioHandler *node;

    node = find_aio_handler(ctx, fd);

    /* Are we deleting the fd handler? */
    if (!io_read && !io_write) {
        if (node) {
            g_source_remove_poll(&ctx->source, &node->pfd);

            /* If the lock is held, just mark the node as deleted */
            if (ctx->walking_handlers) {
                node->deleted = 1;
                node->pfd.revents = 0;
            } else {
                /* Otherwise, delete it for real. We can't just mark it as
                 * deleted because deleted nodes are only cleaned up after
                 * releasing the walking_handlers lock.
                 */
                QLIST_REMOVE(node, node);
                g_free(node);
            }
        }
    } else {
        if (node == NULL) {
            /* Alloc and insert if it's not already there */
            node = g_new0(AioHandler, 1);
            node->pfd.fd = fd;
            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);

            g_source_add_poll(&ctx->source, &node->pfd);
        }
        /* Update handler with latest information */
        node->io_read = io_read;
        node->io_write = io_write;
        node->opaque = opaque;

        node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
        node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
    }

    aio_notify(ctx);
}

void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *notifier,
                            EventNotifierHandler *io_read)
{
    aio_set_fd_handler(ctx, event_notifier_get_fd(notifier),
                       (IOHandler *)io_read, NULL, notifier);
}

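/*
 * Illustrative sketch (not part of the original file): how a caller might
 * use aio_set_fd_handler() above to watch a socket for readability and
 * later stop watching it.  The handler and helper names are hypothetical.
 */
#if 0
static void example_sock_read(void *opaque)
{
    int fd = *(int *)opaque;
    /* called from aio_dispatch() when the fd reports G_IO_IN/HUP/ERR */
    (void)fd;
}

static void example_watch_socket(AioContext *ctx, int *fd)
{
    /* io_read != NULL, so G_IO_IN | G_IO_HUP | G_IO_ERR are requested */
    aio_set_fd_handler(ctx, *fd, example_sock_read, NULL, fd);
}

static void example_unwatch_socket(AioContext *ctx, int *fd)
{
    /* NULL for both handlers removes the node (or defers removal while
     * walking_handlers is held, as described above) */
    aio_set_fd_handler(ctx, *fd, NULL, NULL, NULL);
}
#endif
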
bool aio_prepare(AioContext *ctx)
{
    return false;
}

bool aio_pending(AioContext *ctx)
{
    AioHandler *node;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        int revents;

        revents = node->pfd.revents & node->pfd.events;
        if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read) {
            return true;
        }
        if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write) {
            return true;
        }
    }

    return false;
}

bool aio_dispatch(AioContext *ctx)
{
    AioHandler *node;
    bool progress = false;

    /*
     * If there are callbacks left that have been queued, we need to call them.
     * Do not call select in this case, because it is possible that the caller
     * does not need a complete flush (as is the case for aio_poll loops).
     */
    if (aio_bh_poll(ctx)) {
        progress = true;
    }

    /*
     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
     */
    node = QLIST_FIRST(&ctx->aio_handlers);
    while (node) {
        AioHandler *tmp;
        int revents;

        ctx->walking_handlers++;

        revents = node->pfd.revents & node->pfd.events;
        node->pfd.revents = 0;

        if (!node->deleted &&
            (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
            node->io_read) {
            node->io_read(node->opaque);

            /* aio_notify() does not count as progress */
            if (node->opaque != &ctx->notifier) {
                progress = true;
            }
        }
        if (!node->deleted &&
            (revents & (G_IO_OUT | G_IO_ERR)) &&
            node->io_write) {
            node->io_write(node->opaque);
            progress = true;
        }

        tmp = node;
        node = QLIST_NEXT(node, node);

        ctx->walking_handlers--;

        if (!ctx->walking_handlers && tmp->deleted) {
            QLIST_REMOVE(tmp, node);
            g_free(tmp);
        }
    }

    /* Run our timers */
    progress |= timerlistgroup_run_timers(&ctx->tlg);

    return progress;
}

/* These thread-local variables are used only in a small part of aio_poll
 * around the call to the poll() system call. In particular they are not
 * used while aio_poll is performing callbacks, which makes it much easier
 * to think about reentrancy!
 *
 * Stack-allocated arrays would be perfect but they have size limitations;
 * heap allocation is expensive enough that we want to reuse arrays across
 * calls to aio_poll(). And because poll() has to be called without holding
 * any lock, the arrays cannot be stored in AioContext. Thread-local data
 * has none of the disadvantages of these three options.
 */
static __thread GPollFD *pollfds;
static __thread AioHandler **nodes;
static __thread unsigned npfd, nalloc;
static __thread Notifier pollfds_cleanup_notifier;

static void pollfds_cleanup(Notifier *n, void *unused)
{
    g_assert(npfd == 0);
    g_free(pollfds);
    g_free(nodes);
    nalloc = 0;
}

static void add_pollfd(AioHandler *node)
{
    if (npfd == nalloc) {
        if (nalloc == 0) {
            pollfds_cleanup_notifier.notify = pollfds_cleanup;
            qemu_thread_atexit_add(&pollfds_cleanup_notifier);
            nalloc = 8;
        } else {
            g_assert(nalloc <= INT_MAX);
            nalloc *= 2;
        }
        pollfds = g_renew(GPollFD, pollfds, nalloc);
        nodes = g_renew(AioHandler *, nodes, nalloc);
    }
    nodes[npfd] = node;
    pollfds[npfd] = (GPollFD) {
        .fd = node->pfd.fd,
        .events = node->pfd.events,
    };
    npfd++;
}

bool aio_poll(AioContext *ctx, bool blocking)
{
    AioHandler *node;
    int i, ret;
    bool progress;
    int64_t timeout;

    aio_context_acquire(ctx);
    progress = false;

    /* aio_notify can avoid the expensive event_notifier_set if
     * everything (file descriptors, bottom halves, timers) will
     * be re-evaluated before the next blocking poll(). This is
     * already true when aio_poll is called with blocking == false;
     * if blocking == true, it is only true after poll() returns,
     * so disable the optimization now.
     */
    if (blocking) {
        atomic_add(&ctx->notify_me, 2);
    }

    ctx->walking_handlers++;

    assert(npfd == 0);

    /* fill pollfds */
    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        if (!node->deleted && node->pfd.events) {
            add_pollfd(node);
        }
    }

    timeout = blocking ? aio_compute_timeout(ctx) : 0;

    /* wait until next event */
    if (timeout) {
        aio_context_release(ctx);
    }
    ret = qemu_poll_ns((GPollFD *)pollfds, npfd, timeout);
    if (blocking) {
        atomic_sub(&ctx->notify_me, 2);
    }
    if (timeout) {
        aio_context_acquire(ctx);
    }

    aio_notify_accept(ctx);

    /* if we have any readable fds, dispatch event */
    if (ret > 0) {
        for (i = 0; i < npfd; i++) {
            nodes[i]->pfd.revents = pollfds[i].revents;
        }
    }

    npfd = 0;
    ctx->walking_handlers--;

    /* Run dispatch even if there were no readable fds to run timers */
    if (aio_dispatch(ctx)) {
        progress = true;
    }

    aio_context_release(ctx);

    return progress;
}