/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Linux io_uring file descriptor monitoring
 *
 * The Linux io_uring API supports file descriptor monitoring with a few
 * advantages over existing APIs like poll(2) and epoll(7):
 *
 * 1. Userspace polling of events is possible because the completion queue (cq
 *    ring) is shared between the kernel and userspace.  This allows
 *    applications that rely on userspace polling to also monitor file
 *    descriptors in the same userspace polling loop.
 *
 * 2. Submission and completion are batched and done together in a single
 *    system call.  This minimizes the number of system calls.
 *
 * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than
 *    poll(2).
 *
 * 4. Nanosecond timeouts are supported so it requires fewer syscalls than
 *    epoll(7).
 *
 * This code only monitors file descriptors and does not do asynchronous disk
 * I/O.  Implementing disk I/O efficiently has other requirements and should
 * use a separate io_uring so it does not make sense to unify the code.
 *
 * File descriptor monitoring is implemented using the following operations:
 *
 * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored.
 * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored.  When
 *    the poll mask changes for a file descriptor it is first removed and then
 *    re-added with the new poll mask, so this operation is also used as part
 *    of modifying an existing monitored file descriptor.
 * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait
 *    for events.  This operation self-cancels if another event completes
 *    before the timeout.
 *
 * io_uring calls the submission queue the "sq ring" and the completion queue
 * the "cq ring".  Ring entries are called "sqe" and "cqe", respectively.
 *
 * The code is structured so that sq/cq rings are only modified within
 * fdmon_io_uring_wait().  Changes to AioHandlers are made by enqueuing them on
 * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD
 * and/or IORING_OP_POLL_REMOVE sqes for them.
 */
#include "qemu/osdep.h"
#include <poll.h>
#include "qemu/rcu_queue.h"
#include "aio-posix.h"
/* Ring sizing and the state bits stored in AioHandler::flags. */
enum {
    FDMON_IO_URING_ENTRIES  = 128, /* sq/cq ring size */

    /* AioHandler::flags */
    FDMON_IO_URING_PENDING  = (1 << 0), /* handler is queued for submission */
    FDMON_IO_URING_ADD      = (1 << 1), /* submit IORING_OP_POLL_ADD */
    FDMON_IO_URING_REMOVE   = (1 << 2), /* submit IORING_OP_POLL_REMOVE */
};
60 static inline int poll_events_from_pfd(int pfd_events
)
62 return (pfd_events
& G_IO_IN
? POLLIN
: 0) |
63 (pfd_events
& G_IO_OUT
? POLLOUT
: 0) |
64 (pfd_events
& G_IO_HUP
? POLLHUP
: 0) |
65 (pfd_events
& G_IO_ERR
? POLLERR
: 0);
68 static inline int pfd_events_from_poll(int poll_events
)
70 return (poll_events
& POLLIN
? G_IO_IN
: 0) |
71 (poll_events
& POLLOUT
? G_IO_OUT
: 0) |
72 (poll_events
& POLLHUP
? G_IO_HUP
: 0) |
73 (poll_events
& POLLERR
? G_IO_ERR
: 0);
77 * Returns an sqe for submitting a request. Only be called within
78 * fdmon_io_uring_wait().
80 static struct io_uring_sqe
*get_sqe(AioContext
*ctx
)
82 struct io_uring
*ring
= &ctx
->fdmon_io_uring
;
83 struct io_uring_sqe
*sqe
= io_uring_get_sqe(ring
);
90 /* No free sqes left, submit pending sqes first */
92 ret
= io_uring_submit(ring
);
93 } while (ret
== -EINTR
);
96 sqe
= io_uring_get_sqe(ring
);
101 /* Atomically enqueue an AioHandler for sq ring submission */
102 static void enqueue(AioHandlerSList
*head
, AioHandler
*node
, unsigned flags
)
106 old_flags
= qatomic_fetch_or(&node
->flags
, FDMON_IO_URING_PENDING
| flags
);
107 if (!(old_flags
& FDMON_IO_URING_PENDING
)) {
108 QSLIST_INSERT_HEAD_ATOMIC(head
, node
, node_submitted
);
112 /* Dequeue an AioHandler for sq ring submission. Called by fill_sq_ring(). */
113 static AioHandler
*dequeue(AioHandlerSList
*head
, unsigned *flags
)
115 AioHandler
*node
= QSLIST_FIRST(head
);
121 /* Doesn't need to be atomic since fill_sq_ring() moves the list */
122 QSLIST_REMOVE_HEAD(head
, node_submitted
);
125 * Don't clear FDMON_IO_URING_REMOVE. It's sticky so it can serve two
126 * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and
127 * telling process_cqe() to delete the AioHandler when its
128 * IORING_OP_POLL_ADD completes.
130 *flags
= qatomic_fetch_and(&node
->flags
, ~(FDMON_IO_URING_PENDING
|
131 FDMON_IO_URING_ADD
));
135 static void fdmon_io_uring_update(AioContext
*ctx
,
136 AioHandler
*old_node
,
137 AioHandler
*new_node
)
140 enqueue(&ctx
->submit_list
, new_node
, FDMON_IO_URING_ADD
);
145 * Deletion is tricky because IORING_OP_POLL_ADD and
146 * IORING_OP_POLL_REMOVE are async. We need to wait for the original
147 * IORING_OP_POLL_ADD to complete before this handler can be freed
150 * It's possible that the file descriptor becomes ready and the
151 * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is
154 * Mark this handler deleted right now but don't place it on
155 * ctx->deleted_aio_handlers yet. Instead, manually fudge the list
156 * entry to make QLIST_IS_INSERTED() think this handler has been
157 * inserted and other code recognizes this AioHandler as deleted.
159 * Once the original IORING_OP_POLL_ADD completes we enqueue the
160 * handler on the real ctx->deleted_aio_handlers list to be freed.
162 assert(!QLIST_IS_INSERTED(old_node
, node_deleted
));
163 old_node
->node_deleted
.le_prev
= &old_node
->node_deleted
.le_next
;
165 enqueue(&ctx
->submit_list
, old_node
, FDMON_IO_URING_REMOVE
);
169 static void add_poll_add_sqe(AioContext
*ctx
, AioHandler
*node
)
171 struct io_uring_sqe
*sqe
= get_sqe(ctx
);
172 int events
= poll_events_from_pfd(node
->pfd
.events
);
174 io_uring_prep_poll_add(sqe
, node
->pfd
.fd
, events
);
175 io_uring_sqe_set_data(sqe
, node
);
178 static void add_poll_remove_sqe(AioContext
*ctx
, AioHandler
*node
)
180 struct io_uring_sqe
*sqe
= get_sqe(ctx
);
182 io_uring_prep_poll_remove(sqe
, node
);
185 /* Add a timeout that self-cancels when another cqe becomes ready */
186 static void add_timeout_sqe(AioContext
*ctx
, int64_t ns
)
188 struct io_uring_sqe
*sqe
;
189 struct __kernel_timespec ts
= {
190 .tv_sec
= ns
/ NANOSECONDS_PER_SECOND
,
191 .tv_nsec
= ns
% NANOSECONDS_PER_SECOND
,
195 io_uring_prep_timeout(sqe
, &ts
, 1, 0);
198 /* Add sqes from ctx->submit_list for submission */
199 static void fill_sq_ring(AioContext
*ctx
)
201 AioHandlerSList submit_list
;
205 QSLIST_MOVE_ATOMIC(&submit_list
, &ctx
->submit_list
);
207 while ((node
= dequeue(&submit_list
, &flags
))) {
208 /* Order matters, just in case both flags were set */
209 if (flags
& FDMON_IO_URING_ADD
) {
210 add_poll_add_sqe(ctx
, node
);
212 if (flags
& FDMON_IO_URING_REMOVE
) {
213 add_poll_remove_sqe(ctx
, node
);
218 /* Returns true if a handler became ready */
219 static bool process_cqe(AioContext
*ctx
,
220 AioHandlerList
*ready_list
,
221 struct io_uring_cqe
*cqe
)
223 AioHandler
*node
= io_uring_cqe_get_data(cqe
);
226 /* poll_timeout and poll_remove have a zero user_data field */
232 * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race
233 * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
234 * bit before IORING_OP_POLL_REMOVE is submitted.
236 flags
= qatomic_fetch_and(&node
->flags
, ~FDMON_IO_URING_REMOVE
);
237 if (flags
& FDMON_IO_URING_REMOVE
) {
238 QLIST_INSERT_HEAD_RCU(&ctx
->deleted_aio_handlers
, node
, node_deleted
);
242 aio_add_ready_handler(ready_list
, node
, pfd_events_from_poll(cqe
->res
));
244 /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */
245 add_poll_add_sqe(ctx
, node
);
249 static int process_cq_ring(AioContext
*ctx
, AioHandlerList
*ready_list
)
251 struct io_uring
*ring
= &ctx
->fdmon_io_uring
;
252 struct io_uring_cqe
*cqe
;
253 unsigned num_cqes
= 0;
254 unsigned num_ready
= 0;
257 io_uring_for_each_cqe(ring
, head
, cqe
) {
258 if (process_cqe(ctx
, ready_list
, cqe
)) {
265 io_uring_cq_advance(ring
, num_cqes
);
269 static int fdmon_io_uring_wait(AioContext
*ctx
, AioHandlerList
*ready_list
,
272 unsigned wait_nr
= 1; /* block until at least one cqe is ready */
275 /* Fall back while external clients are disabled */
276 if (qatomic_read(&ctx
->external_disable_cnt
)) {
277 return fdmon_poll_ops
.wait(ctx
, ready_list
, timeout
);
281 wait_nr
= 0; /* non-blocking */
282 } else if (timeout
> 0) {
283 add_timeout_sqe(ctx
, timeout
);
289 ret
= io_uring_submit_and_wait(&ctx
->fdmon_io_uring
, wait_nr
);
290 } while (ret
== -EINTR
);
294 return process_cq_ring(ctx
, ready_list
);
297 static bool fdmon_io_uring_need_wait(AioContext
*ctx
)
299 /* Have io_uring events completed? */
300 if (io_uring_cq_ready(&ctx
->fdmon_io_uring
)) {
304 /* Are there pending sqes to submit? */
305 if (io_uring_sq_ready(&ctx
->fdmon_io_uring
)) {
309 /* Do we need to process AioHandlers for io_uring changes? */
310 if (!QSLIST_EMPTY_RCU(&ctx
->submit_list
)) {
314 /* Are we falling back to fdmon-poll? */
315 return qatomic_read(&ctx
->external_disable_cnt
);
318 static const FDMonOps fdmon_io_uring_ops
= {
319 .update
= fdmon_io_uring_update
,
320 .wait
= fdmon_io_uring_wait
,
321 .need_wait
= fdmon_io_uring_need_wait
,
324 bool fdmon_io_uring_setup(AioContext
*ctx
)
328 ret
= io_uring_queue_init(FDMON_IO_URING_ENTRIES
, &ctx
->fdmon_io_uring
, 0);
333 QSLIST_INIT(&ctx
->submit_list
);
334 ctx
->fdmon_ops
= &fdmon_io_uring_ops
;
338 void fdmon_io_uring_destroy(AioContext
*ctx
)
340 if (ctx
->fdmon_ops
== &fdmon_io_uring_ops
) {
343 io_uring_queue_exit(&ctx
->fdmon_io_uring
);
345 /* Move handlers due to be removed onto the deleted list */
346 while ((node
= QSLIST_FIRST_RCU(&ctx
->submit_list
))) {
347 unsigned flags
= qatomic_fetch_and(&node
->flags
,
348 ~(FDMON_IO_URING_PENDING
|
350 FDMON_IO_URING_REMOVE
));
352 if (flags
& FDMON_IO_URING_REMOVE
) {
353 QLIST_INSERT_HEAD_RCU(&ctx
->deleted_aio_handlers
, node
, node_deleted
);
356 QSLIST_REMOVE_HEAD_RCU(&ctx
->submit_list
, node_submitted
);
359 ctx
->fdmon_ops
= &fdmon_poll_ops
;