r20989: don't mark epoll as set until after the io_submit() succeeds
[Samba.git] / source / lib / events / events_aio.c
blob cd3c32a9b8fef184588dfac62ab8b0c5062389bd
/*
   Unix SMB/CIFS implementation.

   main select loop and event handling - aio/epoll hybrid implementation

   Copyright (C) Andrew Tridgell 2006

   based on events_standard.c

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

/*
  this is a very strange beast. The Linux AIO implementation doesn't
  yet integrate properly with epoll, but there is a kernel patch that
  allows the aio wait primitives to be used to wait for epoll events,
  and this can be used to give us a unified event system incorporating
  both aio events and epoll events

  this is _very_ experimental code
*/
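/*
  a rough sketch of the mechanism, for orientation (this duplicates
  what setup_epoll_wait() below does for real, and assumes the
  IOCB_CMD_EPOLL_WAIT kernel patch is applied): a single iocb is aimed
  at the epoll fd, and its completion carries the ready epoll events,
  so one io_getevents() call can deliver both disk aio completions and
  fd readiness:

      struct iocb w;
      struct iocb *wp = &w;
      memset(&w, 0, sizeof(w));
      w.aio_fildes     = epoll_fd;              // wait on the epoll set
      w.aio_lio_opcode = IOCB_CMD_EPOLL_WAIT;   // patched opcode
      w.u.c.buf        = epevent_buf;           // filled in by the kernel
      w.u.c.nbytes     = max_events;            // capacity of the buffer
      w.u.c.offset     = -1;
      io_submit(ioctx, 1, &wp);                 // arm the wait
*/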
#include "includes.h"
#include "system/filesys.h"
#include "lib/util/dlinklist.h"
#include "lib/events/events.h"
#include "lib/events/events_internal.h"
#include <sys/epoll.h>
#include <libaio.h>

#define MAX_AIO_QUEUE_DEPTH	100

#ifndef IOCB_CMD_EPOLL_WAIT
#define IOCB_CMD_EPOLL_WAIT	9
#endif
struct aio_event_context {
        /* a pointer back to the generic event_context */
        struct event_context *ev;

        /* number of registered fd event handlers */
        int num_fd_events;

        /* incremented by the fd_event destructor, so the event loop can
           detect that handlers destroyed events under it */
        uint32_t destruction_count;

        /* the kernel aio context used for io_submit()/io_getevents() */
        io_context_t ioctx;

        /* buffer the kernel fills with epoll events when the
           IOCB_CMD_EPOLL_WAIT iocb completes */
        struct epoll_event epevent[MAX_AIO_QUEUE_DEPTH];

        /* the (long lived) iocb used to wait for epoll events */
        struct iocb *epoll_iocb;

        int epoll_fd;

        /* true while epoll_iocb is submitted to the kernel */
        int is_epoll_set;
};
struct aio_event {
        struct event_context *event_ctx;
        struct iocb iocb;
        void *private_data;
        event_aio_handler_t handler;
};
/*
  map from EVENT_FD_* to EPOLLIN/EPOLLOUT
*/
static uint32_t epoll_map_flags(uint16_t flags)
{
        uint32_t ret = 0;
        if (flags & EVENT_FD_READ) ret |= (EPOLLIN | EPOLLERR | EPOLLHUP);
        if (flags & EVENT_FD_WRITE) ret |= (EPOLLOUT | EPOLLERR | EPOLLHUP);
        return ret;
}
/*
  release the aio context and free the epoll fd
*/
static int aio_ctx_destructor(struct aio_event_context *aio_ev)
{
        io_queue_release(aio_ev->ioctx);
        close(aio_ev->epoll_fd);
        aio_ev->epoll_fd = -1;
        return 0;
}
#define EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT	(1<<0)
#define EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR	(1<<1)
#define EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR	(1<<2)
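/*
  how these flags are used by the code below: HAS_EVENT tracks whether
  the fd is currently registered with epoll_ctl(); REPORT_ERROR is set
  only when the caller wants reads, so EPOLLHUP/EPOLLERR are reported
  to the handler as EVENT_FD_READ; GOT_ERROR records that an error was
  seen, so a write-only fd is not re-added to the epoll set
*/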
/*
  add the epoll event to the given fd_event
*/
static void epoll_add_event(struct aio_event_context *aio_ev, struct fd_event *fde)
{
        struct epoll_event event;
        if (aio_ev->epoll_fd == -1) return;

        fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;

        /* if we don't want events yet, don't add an epoll event */
        if (fde->flags == 0) return;

        ZERO_STRUCT(event);
        event.events = epoll_map_flags(fde->flags);
        event.data.ptr = fde;
        epoll_ctl(aio_ev->epoll_fd, EPOLL_CTL_ADD, fde->fd, &event);
        fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;

        /* only if we want to read do we tell the event handler about errors */
        if (fde->flags & EVENT_FD_READ) {
                fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
        }
}
/*
  delete the epoll event for the given fd_event
*/
static void epoll_del_event(struct aio_event_context *aio_ev, struct fd_event *fde)
{
        struct epoll_event event;
        if (aio_ev->epoll_fd == -1) return;

        fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;

        /* if there's no epoll event registered, we don't need to delete it */
        if (!(fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT)) return;

        ZERO_STRUCT(event);
        event.events = epoll_map_flags(fde->flags);
        event.data.ptr = fde;
        epoll_ctl(aio_ev->epoll_fd, EPOLL_CTL_DEL, fde->fd, &event);

        fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
}
/*
  change the epoll event for the given fd_event
*/
static void epoll_mod_event(struct aio_event_context *aio_ev, struct fd_event *fde)
{
        struct epoll_event event;
        if (aio_ev->epoll_fd == -1) return;

        fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;

        ZERO_STRUCT(event);
        event.events = epoll_map_flags(fde->flags);
        event.data.ptr = fde;
        epoll_ctl(aio_ev->epoll_fd, EPOLL_CTL_MOD, fde->fd, &event);

        /* only if we want to read do we tell the event handler about errors */
        if (fde->flags & EVENT_FD_READ) {
                fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
        }
}
/*
  decide whether to add, modify or delete the epoll registration after
  the fd_event flags have changed
*/
static void epoll_change_event(struct aio_event_context *aio_ev, struct fd_event *fde)
{
        BOOL got_error = (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR);
        BOOL want_read = (fde->flags & EVENT_FD_READ);
        BOOL want_write= (fde->flags & EVENT_FD_WRITE);

        if (aio_ev->epoll_fd == -1) return;

        fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;

        /* there's already an event registered */
        if (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT) {
                if (want_read || (want_write && !got_error)) {
                        epoll_mod_event(aio_ev, fde);
                        return;
                }
                epoll_del_event(aio_ev, fde);
                return;
        }

        /* there's no epoll event attached to the fde */
        if (want_read || (want_write && !got_error)) {
                epoll_add_event(aio_ev, fde);
                return;
        }
}
/*
  submit the iocb that waits for epoll events, unless it is already
  submitted
*/
static int setup_epoll_wait(struct aio_event_context *aio_ev)
{
        if (aio_ev->is_epoll_set) {
                return 0;
        }

        memset(aio_ev->epoll_iocb, 0, sizeof(*aio_ev->epoll_iocb));
        aio_ev->epoll_iocb->aio_fildes = aio_ev->epoll_fd;
        aio_ev->epoll_iocb->aio_lio_opcode = IOCB_CMD_EPOLL_WAIT;
        aio_ev->epoll_iocb->aio_reqprio = 0;

        aio_ev->epoll_iocb->u.c.nbytes = MAX_AIO_QUEUE_DEPTH;
        aio_ev->epoll_iocb->u.c.offset = -1;
        aio_ev->epoll_iocb->u.c.buf = aio_ev->epevent;

        if (io_submit(aio_ev->ioctx, 1, &aio_ev->epoll_iocb) != 1) {
                return -1;
        }
        aio_ev->is_epoll_set = 1;

        return 0;
}
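/*
  note (this is what r20989 changes): is_epoll_set is only set once
  io_submit() has succeeded, so a failed submit leaves the flag clear
  and the next call to setup_epoll_wait() will retry the submission
*/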
/*
  event loop handling using the aio/epoll hybrid
*/
static int aio_event_loop(struct aio_event_context *aio_ev, struct timeval *tvalp)
{
        int ret, i;
        uint32_t destruction_count = aio_ev->destruction_count;
        struct timespec timeout;
        struct io_event events[8];

        if (aio_ev->epoll_fd == -1) return -1;

        if (aio_ev->ev->num_signal_handlers &&
            common_event_check_signal(aio_ev->ev)) {
                return 0;
        }

        if (tvalp) {
                timeout.tv_sec = tvalp->tv_sec;
                timeout.tv_nsec = tvalp->tv_usec;
                timeout.tv_nsec *= 1000;
        }

        if (setup_epoll_wait(aio_ev) < 0) {
                return -1;
        }

        ret = io_getevents(aio_ev->ioctx, 1, 8,
                           events, tvalp?&timeout:NULL);

        if (ret == -EINTR) {
                if (aio_ev->ev->num_signal_handlers) {
                        common_event_check_signal(aio_ev->ev);
                }
                return 0;
        }

        if (ret == 0 && tvalp) {
                /* the timeout fired: run timed events */
                common_event_loop_timer(aio_ev->ev);
                return 0;
        }

        for (i=0;i<ret;i++) {
                struct io_event *event = &events[i];
                struct iocb *finished = event->obj;

                switch (finished->aio_lio_opcode) {
                case IO_CMD_PWRITE:
                case IO_CMD_PREAD: {
                        /* a disk aio request completed: hand the result
                           to its registered handler */
                        struct aio_event *ae = talloc_get_type(finished->data,
                                                               struct aio_event);
                        if (ae) {
                                talloc_set_destructor(ae, NULL);
                                ae->handler(ae->event_ctx, ae,
                                            event->res, ae->private_data);
                                talloc_free(ae);
                        }
                        break;
                }
                case IOCB_CMD_EPOLL_WAIT: {
                        /* the epoll wait iocb completed: dispatch the
                           epoll events it collected */
                        struct epoll_event *ep = (struct epoll_event *)finished->u.c.buf;
                        struct fd_event *fde;
                        int j;

                        aio_ev->is_epoll_set = 0;

                        for (j=0; j<event->res; j++, ep++) {
                                /* flags must be reset for each fd, so one
                                   fd's events don't leak into the next */
                                uint16_t flags = 0;
                                fde = talloc_get_type(ep->data.ptr,
                                                      struct fd_event);
                                if (fde == NULL) {
                                        return -1;
                                }
                                if (ep->events & (EPOLLHUP|EPOLLERR)) {
                                        fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR;
                                        if (!(fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR)) {
                                                epoll_del_event(aio_ev, fde);
                                                continue;
                                        }
                                        flags |= EVENT_FD_READ;
                                }
                                if (ep->events & EPOLLIN) flags |= EVENT_FD_READ;
                                if (ep->events & EPOLLOUT) flags |= EVENT_FD_WRITE;
                                if (flags) {
                                        fde->handler(aio_ev->ev, fde, flags, fde->private_data);
                                }
                        }
                        break;
                }
                }

                if (destruction_count != aio_ev->destruction_count) {
                        /* a handler destroyed events under us; our
                           pointers may be stale, so bail out */
                        return 0;
                }
        }

        return 0;
}
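/*
  note that the IOCB_CMD_EPOLL_WAIT completion above clears
  is_epoll_set, so the next pass through aio_event_loop() re-arms the
  epoll wait via setup_epoll_wait()
*/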
/*
  create an aio_event_context structure.
*/
static int aio_event_context_init(struct event_context *ev)
{
        struct aio_event_context *aio_ev;

        aio_ev = talloc_zero(ev, struct aio_event_context);
        if (!aio_ev) return -1;

        aio_ev->ev = ev;
        aio_ev->epoll_iocb = talloc(aio_ev, struct iocb);
        if (!aio_ev->epoll_iocb) return -1;

        if (io_queue_init(MAX_AIO_QUEUE_DEPTH, &aio_ev->ioctx) != 0) {
                return -1;
        }

        aio_ev->epoll_fd = epoll_create(MAX_AIO_QUEUE_DEPTH);
        if (aio_ev->epoll_fd == -1) return -1;

        talloc_set_destructor(aio_ev, aio_ctx_destructor);

        ev->additional_data = aio_ev;
        return 0;
}
/*
  destroy an fd_event
*/
static int aio_event_fd_destructor(struct fd_event *fde)
{
        struct event_context *ev = fde->event_ctx;
        struct aio_event_context *aio_ev = talloc_get_type(ev->additional_data,
                                                           struct aio_event_context);

        aio_ev->num_fd_events--;
        aio_ev->destruction_count++;

        epoll_del_event(aio_ev, fde);

        return 0;
}
/*
  add a fd based event
  return NULL on failure (memory allocation error)
*/
static struct fd_event *aio_event_add_fd(struct event_context *ev, TALLOC_CTX *mem_ctx,
                                         int fd, uint16_t flags,
                                         event_fd_handler_t handler,
                                         void *private_data)
{
        struct aio_event_context *aio_ev = talloc_get_type(ev->additional_data,
                                                           struct aio_event_context);
        struct fd_event *fde;

        fde = talloc(mem_ctx?mem_ctx:ev, struct fd_event);
        if (!fde) return NULL;

        fde->event_ctx = ev;
        fde->fd = fd;
        fde->flags = flags;
        fde->handler = handler;
        fde->private_data = private_data;
        fde->additional_flags = 0;
        fde->additional_data = NULL;

        aio_ev->num_fd_events++;
        talloc_set_destructor(fde, aio_event_fd_destructor);

        epoll_add_event(aio_ev, fde);

        return fde;
}
/*
  return the fd event flags
*/
static uint16_t aio_event_get_fd_flags(struct fd_event *fde)
{
        return fde->flags;
}
/*
  set the fd event flags
*/
static void aio_event_set_fd_flags(struct fd_event *fde, uint16_t flags)
{
        struct event_context *ev;
        struct aio_event_context *aio_ev;

        if (fde->flags == flags) return;

        ev = fde->event_ctx;
        aio_ev = talloc_get_type(ev->additional_data, struct aio_event_context);

        fde->flags = flags;

        epoll_change_event(aio_ev, fde);
}
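/*
  hypothetical usage sketch (EVENT_FD_WRITEABLE/EVENT_FD_NOT_WRITEABLE
  are assumed to be the convenience macros from lib/events/events.h
  that wrap event_set_fd_flags()): a caller typically enables write
  events while it has queued output and disables them once the buffer
  drains:

      EVENT_FD_WRITEABLE(fde);        // want EVENT_FD_WRITE callbacks
      ...
      EVENT_FD_NOT_WRITEABLE(fde);    // buffer empty, stop write events
*/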
/*
  do a single event loop using the events defined in ev
*/
static int aio_event_loop_once(struct event_context *ev)
{
        struct aio_event_context *aio_ev = talloc_get_type(ev->additional_data,
                                                           struct aio_event_context);
        struct timeval tval;

        tval = common_event_loop_delay(ev);

        if (timeval_is_zero(&tval)) {
                common_event_loop_timer(ev);
                return 0;
        }

        return aio_event_loop(aio_ev, &tval);
}
/*
  run the event loop, returning only on failure or (with 0) once all
  fd events have been removed
*/
static int aio_event_loop_wait(struct event_context *ev)
{
        struct aio_event_context *aio_ev = talloc_get_type(ev->additional_data,
                                                           struct aio_event_context);
        while (aio_ev->num_fd_events) {
                if (aio_event_loop_once(ev) != 0) {
                        break;
                }
        }

        return 0;
}
/*
  called when a disk IO event needs to be cancelled
*/
static int aio_destructor(struct aio_event *ae)
{
        struct event_context *ev = ae->event_ctx;
        struct aio_event_context *aio_ev = talloc_get_type(ev->additional_data,
                                                           struct aio_event_context);
        struct io_event result;
        io_cancel(aio_ev->ioctx, &ae->iocb, &result);
        /* TODO: handle errors from io_cancel()! */
        return 0;
}
/* submit an aio disk IO event */
static struct aio_event *aio_event_add_aio(struct event_context *ev,
                                           TALLOC_CTX *mem_ctx,
                                           struct iocb *iocb,
                                           event_aio_handler_t handler,
                                           void *private_data)
{
        struct aio_event_context *aio_ev = talloc_get_type(ev->additional_data,
                                                           struct aio_event_context);
        struct iocb *iocbp;
        struct aio_event *ae = talloc(mem_ctx?mem_ctx:ev, struct aio_event);
        if (ae == NULL) return NULL;

        ae->event_ctx = ev;
        ae->iocb = *iocb;
        ae->handler = handler;
        ae->private_data = private_data;
        iocbp = &ae->iocb;

        if (io_submit(aio_ev->ioctx, 1, &iocbp) != 1) {
                talloc_free(ae);
                return NULL;
        }
        /* the completion path reads this through event->obj, which
           points at our copy of the iocb, so setting it after
           io_submit() is safe in this single threaded loop */
        ae->iocb.data = ae;
        talloc_set_destructor(ae, aio_destructor);

        return ae;
}
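/*
  hypothetical usage sketch (my_buf/my_handler/my_private are invented
  names; io_prep_pread() is the standard libaio helper, and
  event_add_aio() is assumed to be the wrapper declared in
  lib/events/events.h): submit a disk read and get a callback with the
  result when it completes:

      struct iocb cb;
      io_prep_pread(&cb, fd, my_buf, count, offset);
      ae = event_add_aio(ev, mem_ctx, &cb, my_handler, my_private);
      if (ae == NULL) {
              // allocation or io_submit() failed
      }
*/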
static const struct event_ops aio_event_ops = {
        .context_init  = aio_event_context_init,
        .add_fd        = aio_event_add_fd,
        .add_aio       = aio_event_add_aio,
        .get_fd_flags  = aio_event_get_fd_flags,
        .set_fd_flags  = aio_event_set_fd_flags,
        .add_timed     = common_event_add_timed,
        .add_signal    = common_event_add_signal,
        .loop_once     = aio_event_loop_once,
        .loop_wait     = aio_event_loop_wait,
};

NTSTATUS events_aio_init(void)
{
        return event_register_backend("aio", &aio_event_ops);
}
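/*
  a backend registered this way can then be selected by name, assuming
  the byname initialiser from lib/events/events.c:

      struct event_context *ev = event_context_init_byname(mem_ctx, "aio");
*/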