s3-linux-aio: Fix error handling
source3/modules/vfs_aio_linux.c
/*
 * Simulate Posix AIO using Linux kernel AIO.
 *
 * Copyright (C) Jeremy Allison 2012
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include "includes.h"
#include "system/filesys.h"
#include "smbd/smbd.h"
#include "smbd/globals.h"
#include <sys/eventfd.h>
#include <libaio.h>
struct aio_extra;

static int event_fd = -1;
static io_context_t io_ctx;
static int aio_linux_requestid;
static struct io_event *io_recv_events;
static struct fd_event *aio_read_event;
struct aio_private_data {
	struct aio_private_data *prev, *next;
	int requestid;
	SMB_STRUCT_AIOCB *aiocb;
	struct iocb *event_iocb;
	ssize_t ret_size;
	int ret_errno;
	bool cancelled;
};

/* List of outstanding requests we have. */
static struct aio_private_data *pd_list;
static void aio_linux_handle_completion(struct event_context *event_ctx,
					struct fd_event *event,
					uint16 flags,
					void *p);
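
/************************************************************************
 Overview: each POSIX aio request is converted into a struct iocb and
 submitted with io_submit(). io_set_eventfd() ties every iocb to the
 module-global event_fd, so the kernel bumps that eventfd counter when
 one of our requests completes. tevent watches event_fd for readability
 and dispatches aio_linux_handle_completion(), which reads the counter
 and drains that many completions via io_getevents(). The io_callback_t
 slot of each iocb is used to carry the aio_private_data pointer, so the
 data field of the returned io_event leads straight back to the request.
***********************************************************************/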
/************************************************************************
 Housekeeping. Cleanup if no activity for 30 seconds.
***********************************************************************/

static void aio_linux_housekeeping(struct tevent_context *event_ctx,
					struct tevent_timer *te,
					struct timeval now,
					void *private_data)
{
	/* Remove this timed event handler. */
	TALLOC_FREE(te);

	if (pd_list != NULL) {
		/* Still busy. Look again in 30 seconds. */
		(void)tevent_add_timer(event_ctx,
					NULL,
					timeval_current_ofs(30, 0),
					aio_linux_housekeeping,
					NULL);
		return;
	}

	/* No activity for 30 seconds. Close out kernel resources. */
	io_queue_release(io_ctx);
	memset(&io_ctx, '\0', sizeof(io_ctx));

	if (event_fd != -1) {
		close(event_fd);
		event_fd = -1;
	}

	TALLOC_FREE(aio_read_event);
	TALLOC_FREE(io_recv_events);
}
/************************************************************************
 Ensure event fd and aio context are initialized.
***********************************************************************/

static bool init_aio_linux(struct vfs_handle_struct *handle)
{
	struct tevent_timer *te = NULL;

	if (event_fd != -1) {
		/* Already initialized. */
		return true;
	}

	/* Schedule a shutdown event for 30 seconds from now. */
	te = tevent_add_timer(server_event_context(),
				NULL,
				timeval_current_ofs(30, 0),
				aio_linux_housekeeping,
				NULL);
	if (te == NULL) {
		goto fail;
	}

	/* Ensure we have enough space for aio_pending_size events. */
	io_recv_events = talloc_zero_array(NULL,
				struct io_event,
				aio_pending_size);
	if (io_recv_events == NULL) {
		goto fail;
	}

	event_fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
	if (event_fd == -1) {
		goto fail;
	}

	aio_read_event = tevent_add_fd(server_event_context(),
				NULL,
				event_fd,
				TEVENT_FD_READ,
				aio_linux_handle_completion,
				NULL);
	if (aio_read_event == NULL) {
		goto fail;
	}

	if (io_queue_init(aio_pending_size, &io_ctx)) {
		goto fail;
	}

	DEBUG(10,("init_aio_linux: initialized with up to %d events\n",
		aio_pending_size));

	return true;

  fail:

	DEBUG(10,("init_aio_linux: initialization failed\n"));

	TALLOC_FREE(te);
	TALLOC_FREE(io_recv_events);
	TALLOC_FREE(aio_read_event);
	if (event_fd != -1) {
		close(event_fd);
		event_fd = -1;
	}
	memset(&io_ctx, '\0', sizeof(io_ctx));
	return false;
}
/************************************************************************
 Private data destructor.
***********************************************************************/

static int pd_destructor(struct aio_private_data *pd)
{
	DLIST_REMOVE(pd_list, pd);
	return 0;
}
/************************************************************************
 Create and initialize a private data struct.
***********************************************************************/

static struct aio_private_data *create_private_data(TALLOC_CTX *ctx,
					SMB_STRUCT_AIOCB *aiocb)
{
	struct aio_private_data *pd = talloc_zero(ctx, struct aio_private_data);

	if (!pd) {
		return NULL;
	}
	pd->event_iocb = talloc_zero(pd, struct iocb);
	if (pd->event_iocb == NULL) {
		TALLOC_FREE(pd);
		return NULL;
	}
	pd->requestid = aio_linux_requestid++;
	pd->aiocb = aiocb;
	pd->ret_size = -1;
	pd->ret_errno = EINPROGRESS;
	talloc_set_destructor(pd, pd_destructor);
	DLIST_ADD_END(pd_list, pd, struct aio_private_data *);
	return pd;
}
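
/************************************************************************
 Lifetime note: the private data is talloc-parented to the context passed
 in by the caller (the aio_extra for the request), so when smbd frees the
 aio_extra after completion the aio_private_data is freed with it and
 pd_destructor() unlinks it from pd_list.
***********************************************************************/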
/************************************************************************
 Initiate an asynchronous pread call.
***********************************************************************/

static int aio_linux_read(struct vfs_handle_struct *handle,
				struct files_struct *fsp,
				SMB_STRUCT_AIOCB *aiocb)
{
	struct aio_extra *aio_ex = (struct aio_extra *)aiocb->aio_sigevent.sigev_value.sival_ptr;
	struct aio_private_data *pd = NULL;
	int ret;

	if (!init_aio_linux(handle)) {
		return -1;
	}

	pd = create_private_data(aio_ex, aiocb);
	if (pd == NULL) {
		DEBUG(10, ("aio_linux_read: Could not create private data.\n"));
		return -1;
	}

	io_prep_pread(pd->event_iocb,
			pd->aiocb->aio_fildes,
			discard_const(pd->aiocb->aio_buf),
			pd->aiocb->aio_nbytes,
			pd->aiocb->aio_offset);
	io_set_eventfd(pd->event_iocb, event_fd);
	/* Use the callback pointer as a private data ptr. */
	io_set_callback(pd->event_iocb, (io_callback_t)pd);

	ret = io_submit(io_ctx, 1, &pd->event_iocb);
	if (ret < 0) {
		/* io_submit returns a negative errno on failure. */
		errno = -ret;
		return -1;
	}

	DEBUG(10, ("aio_linux_read: requestid=%d pread requested "
		"of %llu bytes at offset %llu\n",
		pd->requestid,
		(unsigned long long)pd->aiocb->aio_nbytes,
		(unsigned long long)pd->aiocb->aio_offset));

	return 0;
}
/************************************************************************
 Initiate an asynchronous pwrite call.
***********************************************************************/

static int aio_linux_write(struct vfs_handle_struct *handle,
				struct files_struct *fsp,
				SMB_STRUCT_AIOCB *aiocb)
{
	struct aio_extra *aio_ex = (struct aio_extra *)aiocb->aio_sigevent.sigev_value.sival_ptr;
	struct aio_private_data *pd = NULL;
	int ret;

	if (!init_aio_linux(handle)) {
		return -1;
	}

	pd = create_private_data(aio_ex, aiocb);
	if (pd == NULL) {
		DEBUG(10, ("aio_linux_write: Could not create private data.\n"));
		return -1;
	}

	io_prep_pwrite(pd->event_iocb,
			pd->aiocb->aio_fildes,
			discard_const(pd->aiocb->aio_buf),
			pd->aiocb->aio_nbytes,
			pd->aiocb->aio_offset);
	io_set_eventfd(pd->event_iocb, event_fd);
	/* Use the callback pointer as a private data ptr. */
	io_set_callback(pd->event_iocb, (io_callback_t)pd);

	ret = io_submit(io_ctx, 1, &pd->event_iocb);
	if (ret < 0) {
		/* io_submit returns a negative errno on failure. */
		errno = -ret;
		return -1;
	}

	DEBUG(10, ("aio_linux_write: requestid=%d pwrite requested "
		"of %llu bytes at offset %llu\n",
		pd->requestid,
		(unsigned long long)pd->aiocb->aio_nbytes,
		(unsigned long long)pd->aiocb->aio_offset));

	return 0;
}
/************************************************************************
 Save off the error / success conditions from the io_event.
 Is idempotent (can be called multiple times given the same ioev).
***********************************************************************/

static void aio_linux_setup_returns(struct io_event *ioev)
{
	struct aio_private_data *pd = (struct aio_private_data *)ioev->data;

	/*
	 * ioev->res is unsigned in libaio; the kernel stores -errno in it
	 * on failure, so cast to a signed type before checking for error.
	 */
	if ((int64_t)ioev->res < 0) {
		pd->ret_size = -1;
		pd->ret_errno = (int)-(int64_t)ioev->res;
	} else {
		pd->ret_size = ioev->res;
		pd->ret_errno = 0;
	}
}
/************************************************************************
 Handle a single finished io.
***********************************************************************/

static void aio_linux_handle_io_finished(struct io_event *ioev)
{
	struct aio_extra *aio_ex = NULL;
	struct aio_private_data *pd = (struct aio_private_data *)ioev->data;

	aio_linux_setup_returns(ioev);

	aio_ex = (struct aio_extra *)pd->aiocb->aio_sigevent.sigev_value.sival_ptr;
	smbd_aio_complete_aio_ex(aio_ex);

	DEBUG(10,("aio_linux_handle_io_finished: requestid %d completed\n",
		pd->requestid ));
	TALLOC_FREE(aio_ex);
}
/************************************************************************
 Callback when multiple IOs complete.
***********************************************************************/

static void aio_linux_handle_completion(struct event_context *event_ctx,
				struct fd_event *event,
				uint16 flags,
				void *p)
{
	uint64_t num_events = 0;

	DEBUG(10, ("aio_linux_handle_completion called with flags=%d\n",
		(int)flags));

	if ((flags & EVENT_FD_READ) == 0) {
		return;
	}

	/* Read the number of events available. */
	if (sys_read(event_fd, &num_events, sizeof(num_events)) !=
			sizeof(num_events)) {
		smb_panic("aio_linux_handle_completion: invalid read");
	}

	while (num_events > 0) {
		uint64_t events_to_read = MIN(num_events, aio_pending_size);
		struct timespec ts;
		int i;
		int ret;

		ts.tv_sec = 0;
		ts.tv_nsec = 0;

		ret = io_getevents(io_ctx,
			1,
			(long)events_to_read,
			io_recv_events,
			&ts);

		if (ret < 0) {
			errno = -ret;
			DEBUG(1, ("aio_linux_handle_completion: "
				"io_getevents error %s\n",
				strerror(errno) ));
			return;
		}

		if (ret == 0) {
			DEBUG(10, ("aio_linux_handle_completion: "
				"io_getevents returned 0\n"));
			continue;
		}

		/* ret is positive. */
		for (i = 0; i < ret; i++) {
			aio_linux_handle_io_finished(&io_recv_events[i]);
		}

		num_events -= ret;
	}
}
/************************************************************************
 Find the private data by aiocb.
***********************************************************************/

static struct aio_private_data *find_private_data_by_aiocb(SMB_STRUCT_AIOCB *aiocb)
{
	struct aio_private_data *pd;

	for (pd = pd_list; pd != NULL; pd = pd->next) {
		if (pd->aiocb == aiocb) {
			return pd;
		}
	}

	return NULL;
}
/************************************************************************
 Called to return the result of a completed AIO.
 Should only be called if aio_error returns something other than EINPROGRESS.
 Returns:
	-1 - error occurred, errno holds the error code.
	Any other value - return from IO operation.
***********************************************************************/

static ssize_t aio_linux_return_fn(struct vfs_handle_struct *handle,
			struct files_struct *fsp,
			SMB_STRUCT_AIOCB *aiocb)
{
	struct aio_private_data *pd = find_private_data_by_aiocb(aiocb);

	if (pd == NULL) {
		errno = EINVAL;
		DEBUG(0, ("aio_linux_return_fn: returning EINVAL\n"));
		return -1;
	}

	pd->aiocb = NULL;

	if (pd->cancelled) {
		errno = ECANCELED;
		return -1;
	}

	if (pd->ret_size == -1) {
		errno = pd->ret_errno;
	}

	return pd->ret_size;
}
/************************************************************************
 Called to check the result of an AIO.
 Returns:
	EINPROGRESS - still in progress.
	EINVAL - invalid aiocb.
	ECANCELED - request was cancelled.
	0 - request completed successfully.
	Any other value - errno from IO operation.
***********************************************************************/

static int aio_linux_error_fn(struct vfs_handle_struct *handle,
			struct files_struct *fsp,
			SMB_STRUCT_AIOCB *aiocb)
{
	struct aio_private_data *pd = find_private_data_by_aiocb(aiocb);

	if (pd == NULL) {
		return EINVAL;
	}
	if (pd->cancelled) {
		return ECANCELED;
	}
	return pd->ret_errno;
}
/************************************************************************
 Called to request the cancel of an AIO, or all of them on a specific
 fsp if aiocb == NULL.
***********************************************************************/

static int aio_linux_cancel(struct vfs_handle_struct *handle,
			struct files_struct *fsp,
			SMB_STRUCT_AIOCB *aiocb)
{
	struct aio_private_data *pd = NULL;

	for (pd = pd_list; pd != NULL; pd = pd->next) {
		if (pd->aiocb == NULL) {
			continue;
		}
		if (pd->aiocb->aio_fildes != fsp->fh->fd) {
			continue;
		}
		if ((aiocb != NULL) && (pd->aiocb != aiocb)) {
			continue;
		}

		/*
		 * We let the kernel do its job, but we discard the result when
		 * it's finished. NB. Should I call io_cancel here ?
		 */

		pd->cancelled = true;
	}

	return AIO_CANCELED;
}
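
/************************************************************************
 Cancellation note: no io_cancel() is issued above. The kernel request
 runs to completion as usual; marking pd->cancelled only makes
 aio_linux_error_fn() and aio_linux_return_fn() report ECANCELED, so the
 eventual result is discarded at this layer.
***********************************************************************/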
/************************************************************************
 Callback for a previously detected job completion deferred to the main
 loop.
***********************************************************************/

static void aio_linux_handle_immediate(struct tevent_context *ctx,
				struct tevent_immediate *im,
				void *private_data)
{
	struct io_event *ioev = (struct io_event *)private_data;

	aio_linux_handle_io_finished(ioev);
	TALLOC_FREE(ioev);
}
/************************************************************************
 Private data struct used in suspend completion code.
***********************************************************************/

struct suspend_private {
	int num_entries;
	int num_finished;
	const SMB_STRUCT_AIOCB * const *aiocb_array;
};
/************************************************************************
 Handle a single finished io from suspend.
***********************************************************************/

static void aio_linux_handle_suspend_io_finished(struct suspend_private *sp,
			struct io_event *ioev)
{
	struct aio_private_data *pd = (struct aio_private_data *)ioev->data;
	struct io_event *new_ioev = NULL;
	struct tevent_immediate *im = NULL;
	int i;

	/* Is this a requestid with an aiocb we're interested in ? */
	for (i = 0; i < sp->num_entries; i++) {
		if (sp->aiocb_array[i] == pd->aiocb) {
			sp->num_finished++;
			/*
			 * We don't call aio_linux_handle_io_finished()
			 * here, but only the function that sets up the
			 * return values. This allows
			 * aio_linux_handle_io_finished() to be successfully
			 * called from smbd/aio.c:wait_for_aio_completion()
			 * once we return from here with all io's done.
			 */
			aio_linux_setup_returns(ioev);
			return;
		}
	}

	/* Jobid completed we weren't waiting for.
	   We must reschedule this as an immediate event
	   on the main event context. */
	im = tevent_create_immediate(NULL);
	if (!im) {
		exit_server_cleanly("aio_linux_handle_suspend_completion: no memory");
	}

	new_ioev = (struct io_event *)talloc_memdup(NULL,
						ioev,
						sizeof(struct io_event));
	if (!new_ioev) {
		exit_server_cleanly("aio_linux_handle_suspend_completion: no memory");
	}

	DEBUG(10,("aio_linux_handle_suspend_completion: "
		"re-scheduling requestid %d\n",
		pd->requestid));

	tevent_schedule_immediate(im,
			server_event_context(),
			aio_linux_handle_immediate,
			(void *)new_ioev);
}
/************************************************************************
 Callback when an IO completes from a suspend call.
***********************************************************************/

static void aio_linux_handle_suspend_completion(struct event_context *event_ctx,
				struct fd_event *event,
				uint16 flags,
				void *p)
{
	struct suspend_private *sp = (struct suspend_private *)p;
	uint64_t remaining_events = sp->num_entries - sp->num_finished;
	uint64_t num_events = 0;

	DEBUG(10, ("aio_linux_handle_suspend_completion called with flags=%d\n",
		(int)flags));

	if ((flags & EVENT_FD_READ) == 0) {
		return;
	}

	/* Read the number of events available. */
	if (sys_read(event_fd, &num_events, sizeof(num_events)) !=
			sizeof(num_events)) {
		smb_panic("aio_linux_handle_suspend_completion: invalid read");
	}

	while (num_events > 0) {
		uint64_t events_to_read = MIN(num_events, remaining_events);
		struct timespec ts;
		int i;
		int ret;

		ts.tv_sec = 0;
		ts.tv_nsec = 0;

		ret = io_getevents(io_ctx,
			1,
			(long)events_to_read,
			io_recv_events,
			&ts);

		if (ret < 0) {
			errno = -ret;
			DEBUG(1, ("aio_linux_handle_suspend_completion: "
				"io_getevents error %s\n",
				strerror(errno) ));
			return;
		}

		if (ret == 0) {
			DEBUG(10, ("aio_linux_handle_suspend_completion: "
				"io_getevents returned 0\n"));
			continue;
		}

		/* ret is positive. */
		for (i = 0; i < ret; i++) {
			aio_linux_handle_suspend_io_finished(sp,
						&io_recv_events[i]);
		}

		num_events -= ret;
	}
}
static void aio_linux_suspend_timed_out(struct tevent_context *event_ctx,
					struct tevent_timer *te,
					struct timeval now,
					void *private_data)
{
	bool *timed_out = (bool *)private_data;
	/* Remove this timed event handler. */
	TALLOC_FREE(te);
	*timed_out = true;
}
/************************************************************************
 Called to request everything to stop until all IO is completed.
***********************************************************************/

static int aio_linux_suspend(struct vfs_handle_struct *handle,
			struct files_struct *fsp,
			const SMB_STRUCT_AIOCB * const aiocb_array[],
			int n,
			const struct timespec *timeout)
{
	struct event_context *ev = NULL;
	struct fd_event *sock_event = NULL;
	int ret = -1;
	struct suspend_private sp;
	bool timed_out = false;
	TALLOC_CTX *frame = talloc_stackframe();

	/* This is a blocking call, and has to use a sub-event loop. */
	ev = event_context_init(frame);
	if (ev == NULL) {
		errno = ENOMEM;
		goto out;
	}

	if (timeout) {
		struct timeval tv = convert_timespec_to_timeval(*timeout);
		struct tevent_timer *te = tevent_add_timer(ev,
						frame,
						timeval_current_ofs(tv.tv_sec,
								tv.tv_usec),
						aio_linux_suspend_timed_out,
						&timed_out);
		if (!te) {
			errno = ENOMEM;
			goto out;
		}
	}

	ZERO_STRUCT(sp);
	sp.num_entries = n;
	sp.aiocb_array = aiocb_array;
	sp.num_finished = 0;

	sock_event = tevent_add_fd(ev,
				frame,
				event_fd,
				TEVENT_FD_READ,
				aio_linux_handle_suspend_completion,
				(void *)&sp);
	if (sock_event == NULL) {
		goto out;
	}

	/*
	 * We're going to cheat here. We know that smbd/aio.c
	 * only calls this when it's waiting for every single
	 * outstanding call to finish on a close, so just wait
	 * individually for each IO to complete. We don't care
	 * what order they finish - only that they all do. JRA.
	 */

	while (sp.num_entries != sp.num_finished) {
		if (tevent_loop_once(ev) == -1) {
			goto out;
		}

		if (timed_out) {
			errno = EAGAIN;
			goto out;
		}
	}

	ret = 0;

  out:

	TALLOC_FREE(frame);
	return ret;
}
static int aio_linux_connect(vfs_handle_struct *handle, const char *service,
			     const char *user)
{
	/*********************************************************************
	 * How many io_events to initialize ?
	 * 128 per process seems insane as a default until you realize that
	 * (a) Throttling is done in SMB2 via the crediting algorithm.
	 * (b) SMB1 clients are limited to max_mux (50) outstanding
	 *     requests and Windows clients don't use this anyway.
	 * Essentially we want this to be unlimited unless smb.conf
	 * says different.
	 *********************************************************************/
	aio_pending_size = lp_parm_int(
		SNUM(handle->conn), "aio_linux", "aio num events", 128);
	return SMB_VFS_NEXT_CONNECT(handle, service, user);
}
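
/************************************************************************
 Illustrative smb.conf snippet (example only - the share name and the
 value are arbitrary). The parametric option read above appears in
 smb.conf as "aio_linux:aio num events", and the module itself is loaded
 through "vfs objects":

	[data]
		vfs objects = aio_linux
		aio_linux:aio num events = 256
***********************************************************************/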
static struct vfs_fn_pointers vfs_aio_linux_fns = {
	.connect_fn = aio_linux_connect,
	.aio_read_fn = aio_linux_read,
	.aio_write_fn = aio_linux_write,
	.aio_return_fn = aio_linux_return_fn,
	.aio_cancel_fn = aio_linux_cancel,
	.aio_error_fn = aio_linux_error_fn,
	.aio_suspend_fn = aio_linux_suspend,
};

NTSTATUS vfs_aio_linux_init(void)
{
	return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
				"aio_linux", &vfs_aio_linux_fns);
}