util/fdmon-io_uring.c

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Linux io_uring file descriptor monitoring
 *
 * The Linux io_uring API supports file descriptor monitoring with a few
 * advantages over existing APIs like poll(2) and epoll(7):
 *
 * 1. Userspace polling of events is possible because the completion queue (cq
 *    ring) is shared between the kernel and userspace. This allows
 *    applications that rely on userspace polling to also monitor file
 *    descriptors in the same userspace polling loop.
 *
 * 2. Submission and completion are batched and done together in a single
 *    system call. This minimizes the number of system calls.
 *
 * 3. File descriptor monitoring is O(1) like epoll(7), so it scales better
 *    than poll(2).
 *
 * 4. Nanosecond timeouts are supported, so it requires fewer syscalls than
 *    epoll(7).
 *
 * This code only monitors file descriptors and does not do asynchronous disk
 * I/O. Implementing disk I/O efficiently has other requirements and should
 * use a separate io_uring, so it does not make sense to unify the code.
 *
 * File descriptor monitoring is implemented using the following operations:
 *
 * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored.
 * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored. When
 *    the poll mask changes for a file descriptor it is first removed and then
 *    re-added with the new poll mask, so this operation is also used as part
 *    of modifying an existing monitored file descriptor.
 * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait
 *    for events. This operation self-cancels if another event completes
 *    before the timeout.
 *
 * io_uring calls the submission queue the "sq ring" and the completion queue
 * the "cq ring". Ring entries are called "sqe" and "cqe", respectively.
 *
 * The code is structured so that sq/cq rings are only modified within
 * fdmon_io_uring_wait(). Changes to AioHandlers are made by enqueuing them on
 * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD
 * and/or IORING_OP_POLL_REMOVE sqes for them.
 */
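
/*
 * Rough sketch of the flow, assuming QEMU's usual AioContext wiring in which
 * aio_set_fd_handler() ends up calling ctx->fdmon_ops->update: updating a
 * handler calls fdmon_io_uring_update(), which enqueues the AioHandler on
 * ctx->submit_list; the next fdmon_io_uring_wait() call drains that list in
 * fill_sq_ring(), submits the resulting sqes together with an optional
 * IORING_OP_TIMEOUT, and then reaps completions in process_cq_ring().
 */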

#include "qemu/osdep.h"
#include <poll.h>
#include "qemu/rcu_queue.h"
#include "aio-posix.h"

enum {
    FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */

    /* AioHandler::flags */
    FDMON_IO_URING_PENDING = (1 << 0),
    FDMON_IO_URING_ADD = (1 << 1),
    FDMON_IO_URING_REMOVE = (1 << 2),
};
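
/*
 * Flag lifecycle as implemented below: FDMON_IO_URING_PENDING means the
 * handler is currently on ctx->submit_list. FDMON_IO_URING_ADD and
 * FDMON_IO_URING_REMOVE tell fill_sq_ring() which sqes to submit for it;
 * REMOVE additionally stays set so process_cqe() knows to retire the handler
 * once its IORING_OP_POLL_ADD completes.
 */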

static inline int poll_events_from_pfd(int pfd_events)
{
    return (pfd_events & G_IO_IN ? POLLIN : 0) |
           (pfd_events & G_IO_OUT ? POLLOUT : 0) |
           (pfd_events & G_IO_HUP ? POLLHUP : 0) |
           (pfd_events & G_IO_ERR ? POLLERR : 0);
}

static inline int pfd_events_from_poll(int poll_events)
{
    return (poll_events & POLLIN ? G_IO_IN : 0) |
           (poll_events & POLLOUT ? G_IO_OUT : 0) |
           (poll_events & POLLHUP ? G_IO_HUP : 0) |
           (poll_events & POLLERR ? G_IO_ERR : 0);
}

/*
 * Returns an sqe for submitting a request. Only called within
 * fdmon_io_uring_wait().
 */
static struct io_uring_sqe *get_sqe(AioContext *ctx)
{
    struct io_uring *ring = &ctx->fdmon_io_uring;
    struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
    int ret;

    if (likely(sqe)) {
        return sqe;
    }

    /* No free sqes left, submit pending sqes first */
    ret = io_uring_submit(ring);
    assert(ret > 1);
    sqe = io_uring_get_sqe(ring);
    assert(sqe);
    return sqe;
}

/* Atomically enqueue an AioHandler for sq ring submission */
static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags)
{
    unsigned old_flags;

    old_flags = atomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags);
    if (!(old_flags & FDMON_IO_URING_PENDING)) {
        QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted);
    }
}
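
/*
 * Only the caller that transitions FDMON_IO_URING_PENDING from clear to set
 * inserts the node above, so concurrent enqueue() calls for the same
 * AioHandler cannot put it on the list twice; later callers merely OR in
 * additional flags.
 */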

/* Dequeue an AioHandler for sq ring submission. Called by fill_sq_ring(). */
static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags)
{
    AioHandler *node = QSLIST_FIRST(head);

    if (!node) {
        return NULL;
    }

    /* Doesn't need to be atomic since fill_sq_ring() moves the list */
    QSLIST_REMOVE_HEAD(head, node_submitted);

    /*
     * Don't clear FDMON_IO_URING_REMOVE. It's sticky so it can serve two
     * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and
     * telling process_cqe() to delete the AioHandler when its
     * IORING_OP_POLL_ADD completes.
     */
    *flags = atomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING |
                                              FDMON_IO_URING_ADD));
    return node;
}

static void fdmon_io_uring_update(AioContext *ctx,
                                  AioHandler *old_node,
                                  AioHandler *new_node)
{
    if (new_node) {
        enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD);
    }

    if (old_node) {
        /*
         * Deletion is tricky because IORING_OP_POLL_ADD and
         * IORING_OP_POLL_REMOVE are async. We need to wait for the original
         * IORING_OP_POLL_ADD to complete before this handler can be freed
         * safely.
         *
         * It's possible that the file descriptor becomes ready and the
         * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is
         * submitted, too.
         *
         * Mark this handler deleted right now but don't place it on
         * ctx->deleted_aio_handlers yet. Instead, manually fudge the list
         * entry to make QLIST_IS_INSERTED() think this handler has been
         * inserted and other code recognizes this AioHandler as deleted.
         *
         * Once the original IORING_OP_POLL_ADD completes we enqueue the
         * handler on the real ctx->deleted_aio_handlers list to be freed.
         */
        assert(!QLIST_IS_INSERTED(old_node, node_deleted));
        old_node->node_deleted.le_prev = &old_node->node_deleted.le_next;

        enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE);
    }
}
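
/*
 * The le_prev fudge above relies on QLIST_IS_INSERTED() treating a non-NULL
 * le_prev as "on a list"; pointing le_prev at the node's own le_next field
 * satisfies that check without touching any real list.
 */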

static void add_poll_add_sqe(AioContext *ctx, AioHandler *node)
{
    struct io_uring_sqe *sqe = get_sqe(ctx);
    int events = poll_events_from_pfd(node->pfd.events);

    io_uring_prep_poll_add(sqe, node->pfd.fd, events);
    io_uring_sqe_set_data(sqe, node);
}

static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node)
{
    struct io_uring_sqe *sqe = get_sqe(ctx);

    io_uring_prep_poll_remove(sqe, node);
}

/* Add a timeout that self-cancels when another cqe becomes ready */
static void add_timeout_sqe(AioContext *ctx, int64_t ns)
{
    struct io_uring_sqe *sqe;
    struct __kernel_timespec ts = {
        .tv_sec = ns / NANOSECONDS_PER_SECOND,
        .tv_nsec = ns % NANOSECONDS_PER_SECOND,
    };

    sqe = get_sqe(ctx);
    io_uring_prep_timeout(sqe, &ts, 1, 0);
}
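
/*
 * The third argument to io_uring_prep_timeout() is the completion count: a
 * value of 1 makes the timeout complete as soon as one other cqe is posted,
 * which is what gives the self-cancelling behaviour described above.
 */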

/* Add sqes from ctx->submit_list for submission */
static void fill_sq_ring(AioContext *ctx)
{
    AioHandlerSList submit_list;
    AioHandler *node;
    unsigned flags;

    QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list);

    while ((node = dequeue(&submit_list, &flags))) {
        /* Order matters, just in case both flags were set */
        if (flags & FDMON_IO_URING_ADD) {
            add_poll_add_sqe(ctx, node);
        }
        if (flags & FDMON_IO_URING_REMOVE) {
            add_poll_remove_sqe(ctx, node);
        }
    }
}
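
/*
 * The ordering in the loop above matters when both flags are set for the same
 * handler: the IORING_OP_POLL_ADD sqe must precede the IORING_OP_POLL_REMOVE
 * sqe in the ring so that the remove can find and cancel the poll request it
 * targets (POLL_REMOVE identifies its victim by user_data, i.e. the node
 * pointer).
 */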

/* Returns true if a handler became ready */
static bool process_cqe(AioContext *ctx,
                        AioHandlerList *ready_list,
                        struct io_uring_cqe *cqe)
{
    AioHandler *node = io_uring_cqe_get_data(cqe);
    unsigned flags;

    /* poll_timeout and poll_remove have a zero user_data field */
    if (!node) {
        return false;
    }

    /*
     * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race
     * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
     * bit before IORING_OP_POLL_REMOVE is submitted.
     */
    flags = atomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE);
    if (flags & FDMON_IO_URING_REMOVE) {
        QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
        return false;
    }

    aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res));

    /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */
    add_poll_add_sqe(ctx, node);
    return true;
}
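
/*
 * For IORING_OP_POLL_ADD completions, cqe->res carries the poll(2)-style
 * revents mask on success, which is why it can be fed straight into
 * pfd_events_from_poll() above.
 */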

static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
{
    struct io_uring *ring = &ctx->fdmon_io_uring;
    struct io_uring_cqe *cqe;
    unsigned num_cqes = 0;
    unsigned num_ready = 0;
    unsigned head;

    io_uring_for_each_cqe(ring, head, cqe) {
        if (process_cqe(ctx, ready_list, cqe)) {
            num_ready++;
        }

        num_cqes++;
    }

    io_uring_cq_advance(ring, num_cqes);
    return num_ready;
}

static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
                               int64_t timeout)
{
    unsigned wait_nr = 1; /* block until at least one cqe is ready */
    int ret;

    /* Fall back while external clients are disabled */
    if (atomic_read(&ctx->external_disable_cnt)) {
        return fdmon_poll_ops.wait(ctx, ready_list, timeout);
    }

    if (timeout == 0) {
        wait_nr = 0; /* non-blocking */
    } else if (timeout > 0) {
        add_timeout_sqe(ctx, timeout);
    }

    fill_sq_ring(ctx);

    ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
    assert(ret >= 0);

    return process_cq_ring(ctx, ready_list);
}
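
/*
 * Timeout semantics above: timeout == 0 requests a non-blocking poll
 * (wait_nr = 0), timeout > 0 bounds the wait with an IORING_OP_TIMEOUT sqe,
 * and a negative timeout blocks until at least one cqe arrives.
 */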

static bool fdmon_io_uring_need_wait(AioContext *ctx)
{
    return io_uring_cq_ready(&ctx->fdmon_io_uring);
}

static const FDMonOps fdmon_io_uring_ops = {
    .update = fdmon_io_uring_update,
    .wait = fdmon_io_uring_wait,
    .need_wait = fdmon_io_uring_need_wait,
};

bool fdmon_io_uring_setup(AioContext *ctx)
{
    int ret;

    ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
    if (ret != 0) {
        return false;
    }

    QSLIST_INIT(&ctx->submit_list);
    ctx->fdmon_ops = &fdmon_io_uring_ops;
    return true;
}
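
/*
 * A failed io_uring_queue_init() (for example on kernels without io_uring
 * support) leaves ctx->fdmon_ops untouched, so the caller simply keeps
 * whatever monitoring implementation was already installed, typically the
 * poll(2)-based fallback.
 */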

void fdmon_io_uring_destroy(AioContext *ctx)
{
    if (ctx->fdmon_ops == &fdmon_io_uring_ops) {
        AioHandler *node;

        io_uring_queue_exit(&ctx->fdmon_io_uring);

        /* No need to submit these anymore, just free them. */
        while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
            QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
            QLIST_REMOVE(node, node);
            g_free(node);
        }

        ctx->fdmon_ops = &fdmon_poll_ops;
    }
}