s3: Fix a typo
[Samba/gebeck_regimport.git] / source3 / modules / vfs_aio_linux.c
blob d49dc49cc46fab151bcfb8f0e1524626e6535839
/*
 * Simulate Posix AIO using Linux kernel AIO.
 *
 * Copyright (C) Jeremy Allison 2012
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "includes.h"
#include "system/filesys.h"
#include "smbd/smbd.h"
#include "smbd/globals.h"
#include <sys/eventfd.h>
#include <libaio.h>

struct aio_extra;
static int event_fd = -1;
static io_context_t io_ctx;
static int aio_linux_requestid;
static struct io_event *io_recv_events;
static struct fd_event *aio_read_event;
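
/*
 * Completion notification scheme (descriptive note): every iocb submitted
 * below is tagged with event_fd via io_set_eventfd(), so the kernel bumps
 * the eventfd counter once per finished request. aio_read_event watches
 * event_fd in the main tevent loop; its handler reads the counter to learn
 * how many completions are pending and drains them with io_getevents().
 */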

struct aio_private_data {
	struct aio_private_data *prev, *next;
	int requestid;
	SMB_STRUCT_AIOCB *aiocb;
	struct iocb *event_iocb;
	ssize_t ret_size;
	int ret_errno;
	bool cancelled;
};

/* List of outstanding requests we have. */
static struct aio_private_data *pd_list;

static void aio_linux_handle_completion(struct event_context *event_ctx,
				struct fd_event *event,
				uint16 flags,
				void *p);

/************************************************************************
 Housekeeping. Cleanup if no activity for 30 seconds.
***********************************************************************/
static void aio_linux_housekeeping(struct tevent_context *event_ctx,
					struct tevent_timer *te,
					struct timeval now,
					void *private_data)
{
	/* Remove this timed event handler. */
	TALLOC_FREE(te);

	if (pd_list != NULL) {
		/* Still busy. Look again in 30 seconds. */
		(void)tevent_add_timer(event_ctx,
					NULL,
					timeval_current_ofs(30, 0),
					aio_linux_housekeeping,
					NULL);
		return;
	}

	/* No activity for 30 seconds. Close out kernel resources. */
	io_queue_release(io_ctx);
	memset(&io_ctx, '\0', sizeof(io_ctx));

	if (event_fd != -1) {
		close(event_fd);
		event_fd = -1;
	}

	TALLOC_FREE(aio_read_event);
	TALLOC_FREE(io_recv_events);
}

/************************************************************************
 Ensure event fd and aio context are initialized.
***********************************************************************/
static bool init_aio_linux(struct vfs_handle_struct *handle)
{
	struct tevent_timer *te = NULL;

	if (event_fd != -1) {
		/* Already initialized. */
		return true;
	}

	/* Schedule a shutdown event for 30 seconds from now. */
	te = tevent_add_timer(server_event_context(),
				NULL,
				timeval_current_ofs(30, 0),
				aio_linux_housekeeping,
				NULL);

	if (te == NULL) {
		goto fail;
	}

	/* Ensure we have enough space for aio_pending_size events. */
	io_recv_events = talloc_zero_array(NULL,
				struct io_event,
				aio_pending_size);
	if (io_recv_events == NULL) {
		goto fail;
	}

	event_fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
	if (event_fd == -1) {
		goto fail;
	}

	aio_read_event = tevent_add_fd(server_event_context(),
				NULL,
				event_fd,
				TEVENT_FD_READ,
				aio_linux_handle_completion,
				NULL);
	if (aio_read_event == NULL) {
		goto fail;
	}

	if (io_queue_init(aio_pending_size, &io_ctx)) {
		goto fail;
	}

	DEBUG(10,("init_aio_linux: initialized with up to %d events\n",
		  aio_pending_size));

	return true;

  fail:

	DEBUG(10,("init_aio_linux: initialization failed\n"));

	TALLOC_FREE(te);
	TALLOC_FREE(io_recv_events);
	TALLOC_FREE(aio_read_event);
	if (event_fd != -1) {
		close(event_fd);
		event_fd = -1;
	}
	memset(&io_ctx, '\0', sizeof(io_ctx));
	return false;
}
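
/*
 * Sizing note: io_queue_init(aio_pending_size, &io_ctx) caps the number of
 * in-flight kernel AIO requests, and io_recv_events is allocated with the
 * same length so a single io_getevents() call can drain everything that
 * may be outstanding.
 */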

/************************************************************************
 Private data destructor.
***********************************************************************/

static int pd_destructor(struct aio_private_data *pd)
{
	DLIST_REMOVE(pd_list, pd);
	return 0;
}

/************************************************************************
 Create and initialize a private data struct.
***********************************************************************/
static struct aio_private_data *create_private_data(TALLOC_CTX *ctx,
					SMB_STRUCT_AIOCB *aiocb)
{
	struct aio_private_data *pd = talloc_zero(ctx, struct aio_private_data);
	if (!pd) {
		return NULL;
	}
	pd->event_iocb = talloc_zero(pd, struct iocb);
	if (pd->event_iocb == NULL) {
		/* Don't hand back a pd we can't submit with. */
		TALLOC_FREE(pd);
		return NULL;
	}
	pd->requestid = aio_linux_requestid++;
	pd->aiocb = aiocb;
	pd->ret_size = -1;
	pd->ret_errno = EINPROGRESS;
	talloc_set_destructor(pd, pd_destructor);
	DLIST_ADD_END(pd_list, pd, struct aio_private_data *);
	return pd;
}
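
/*
 * Note: pd is parented to the caller-supplied talloc context (the aio_extra
 * for the request), so it is freed together with the request; pd_destructor()
 * then unlinks it from pd_list automatically.
 */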

/************************************************************************
 Initiate an asynchronous pread call.
***********************************************************************/

static int aio_linux_read(struct vfs_handle_struct *handle,
				struct files_struct *fsp,
				SMB_STRUCT_AIOCB *aiocb)
{
	struct aio_extra *aio_ex = (struct aio_extra *)aiocb->aio_sigevent.sigev_value.sival_ptr;
	struct aio_private_data *pd = NULL;
	int ret;

	if (!init_aio_linux(handle)) {
		return -1;
	}

	pd = create_private_data(aio_ex, aiocb);
	if (pd == NULL) {
		DEBUG(10, ("aio_linux_read: Could not create private data.\n"));
		return -1;
	}

	io_prep_pread(pd->event_iocb,
			pd->aiocb->aio_fildes,
			discard_const(pd->aiocb->aio_buf),
			pd->aiocb->aio_nbytes,
			pd->aiocb->aio_offset);
	io_set_eventfd(pd->event_iocb, event_fd);
	/* Use the callback pointer as a private data ptr. */
	io_set_callback(pd->event_iocb, (io_callback_t)pd);

	ret = io_submit(io_ctx, 1, &pd->event_iocb);
	if (ret < 0) {
		/* io_submit returns a negative errno on failure. */
		errno = -ret;
		return -1;
	}

	DEBUG(10, ("aio_linux_read: requestid=%d read requested "
		"of %llu bytes at offset %llu\n",
		pd->requestid,
		(unsigned long long)pd->aiocb->aio_nbytes,
		(unsigned long long)pd->aiocb->aio_offset));

	return 0;
}

/************************************************************************
 Initiate an asynchronous pwrite call.
***********************************************************************/

static int aio_linux_write(struct vfs_handle_struct *handle,
				struct files_struct *fsp,
				SMB_STRUCT_AIOCB *aiocb)
{
	struct aio_extra *aio_ex = (struct aio_extra *)aiocb->aio_sigevent.sigev_value.sival_ptr;
	struct aio_private_data *pd = NULL;
	int ret;

	if (!init_aio_linux(handle)) {
		return -1;
	}

	pd = create_private_data(aio_ex, aiocb);
	if (pd == NULL) {
		DEBUG(10, ("aio_linux_write: Could not create private data.\n"));
		return -1;
	}

	io_prep_pwrite(pd->event_iocb,
			pd->aiocb->aio_fildes,
			discard_const(pd->aiocb->aio_buf),
			pd->aiocb->aio_nbytes,
			pd->aiocb->aio_offset);
	io_set_eventfd(pd->event_iocb, event_fd);
	/* Use the callback pointer as a private data ptr. */
	io_set_callback(pd->event_iocb, (io_callback_t)pd);

	ret = io_submit(io_ctx, 1, &pd->event_iocb);
	if (ret < 0) {
		/* io_submit returns a negative errno on failure. */
		errno = -ret;
		return -1;
	}

	DEBUG(10, ("aio_linux_write: requestid=%d pwrite requested "
		"of %llu bytes at offset %llu\n",
		pd->requestid,
		(unsigned long long)pd->aiocb->aio_nbytes,
		(unsigned long long)pd->aiocb->aio_offset));

	return 0;
}

/************************************************************************
 Handle a single finished io.
***********************************************************************/

static void aio_linux_handle_io_finished(struct io_event *ioev)
{
	struct aio_extra *aio_ex = NULL;
	struct aio_private_data *pd = (struct aio_private_data *)ioev->data;

	/* ioev->res2 contains the -errno if error. */
	/* ioev->res contains the number of bytes sent/received. */
	if (ioev->res2) {
		pd->ret_size = -1;
		pd->ret_errno = -ioev->res2;
	} else {
		pd->ret_size = ioev->res;
		pd->ret_errno = 0;
	}

	aio_ex = (struct aio_extra *)pd->aiocb->aio_sigevent.sigev_value.sival_ptr;
	smbd_aio_complete_aio_ex(aio_ex);

	DEBUG(10,("aio_linux_handle_io_finished: requestid %d completed\n",
		pd->requestid ));
	TALLOC_FREE(aio_ex);
}

/************************************************************************
 Callback when multiple IOs complete.
***********************************************************************/
static void aio_linux_handle_completion(struct event_context *event_ctx,
				struct fd_event *event,
				uint16 flags,
				void *p)
{
	uint64_t num_events = 0;

	DEBUG(10, ("aio_linux_handle_completion called with flags=%d\n",
			(int)flags));

	if ((flags & EVENT_FD_READ) == 0) {
		return;
	}

	/* Read the number of events available. */
	if (sys_read(event_fd, &num_events, sizeof(num_events)) !=
			sizeof(num_events)) {
		smb_panic("aio_linux_handle_completion: invalid read");
	}

	while (num_events > 0) {
		uint64_t events_to_read = MIN(num_events, aio_pending_size);
		struct timespec ts;
		int i;
		int ret;

		ts.tv_sec = 0;
		ts.tv_nsec = 0;

		ret = io_getevents(io_ctx,
			1, /* min_nr */
			(long)events_to_read,
			io_recv_events,
			&ts);

		if (ret < 0) {
			errno = -ret;
			DEBUG(1, ("aio_linux_handle_completion: "
				"io_getevents error %s\n",
				strerror(errno) ));
			return;
		}

		if (ret == 0) {
			DEBUG(10, ("aio_linux_handle_completion: "
				"io_getevents returned 0\n"));
			continue;
		}

		/* ret is positive. */
		for (i = 0; i < ret; i++) {
			aio_linux_handle_io_finished(&io_recv_events[i]);
		}

		num_events -= ret;
	}
}
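
/*
 * Note: the eventfd read above returns how many iocbs have completed since
 * the last read, and the zero timespec passed to io_getevents() means the
 * drain loop never blocks inside the main event loop.
 */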

/************************************************************************
 Find the private data by aiocb.
***********************************************************************/

static struct aio_private_data *find_private_data_by_aiocb(SMB_STRUCT_AIOCB *aiocb)
{
	struct aio_private_data *pd;

	for (pd = pd_list; pd != NULL; pd = pd->next) {
		if (pd->aiocb == aiocb) {
			return pd;
		}
	}

	return NULL;
}

/************************************************************************
 Called to return the result of a completed AIO.
 Should only be called if aio_error returns something other than EINPROGRESS.
 Returns:
	Any other value - return from IO operation.
***********************************************************************/
static ssize_t aio_linux_return_fn(struct vfs_handle_struct *handle,
			struct files_struct *fsp,
			SMB_STRUCT_AIOCB *aiocb)
{
	struct aio_private_data *pd = find_private_data_by_aiocb(aiocb);

	if (pd == NULL) {
		errno = EINVAL;
		DEBUG(0, ("aio_linux_return_fn: returning EINVAL\n"));
		return -1;
	}

	pd->aiocb = NULL;

	if (pd->ret_size == -1) {
		errno = pd->ret_errno;
	}

	return pd->ret_size;
}

/************************************************************************
 Called to check the result of an AIO.
 Returns:
	EINPROGRESS - still in progress.
	EINVAL - invalid aiocb.
	ECANCELED - request was cancelled.
	0 - request completed successfully.
	Any other value - errno from IO operation.
***********************************************************************/
static int aio_linux_error_fn(struct vfs_handle_struct *handle,
			struct files_struct *fsp,
			SMB_STRUCT_AIOCB *aiocb)
{
	struct aio_private_data *pd = find_private_data_by_aiocb(aiocb);

	if (pd == NULL) {
		return EINVAL;
	}
	if (pd->cancelled) {
		return ECANCELED;
	}
	return pd->ret_errno;
}

/************************************************************************
 Called to request the cancel of an AIO, or all of them on a specific
 fsp if aiocb == NULL.
***********************************************************************/
static int aio_linux_cancel(struct vfs_handle_struct *handle,
			struct files_struct *fsp,
			SMB_STRUCT_AIOCB *aiocb)
{
	struct aio_private_data *pd = NULL;

	for (pd = pd_list; pd != NULL; pd = pd->next) {
		if (pd->aiocb == NULL) {
			continue;
		}
		if (pd->aiocb->aio_fildes != fsp->fh->fd) {
			continue;
		}
		if ((aiocb != NULL) && (pd->aiocb != aiocb)) {
			continue;
		}

		/*
		 * We let the kernel do its job, but we discard the result when
		 * it's finished. NB. Should I call io_cancel here ?
		 */

		pd->cancelled = true;
	}

	return AIO_CANCELED;
}

/************************************************************************
 Callback for a previously detected job completion deferred to the main
 loop.
***********************************************************************/
static void aio_linux_handle_immediate(struct tevent_context *ctx,
				struct tevent_immediate *im,
				void *private_data)
{
	struct io_event *ioev = (struct io_event *)private_data;

	aio_linux_handle_io_finished(ioev);
	TALLOC_FREE(ioev);
}

/************************************************************************
 Private data struct used in suspend completion code.
***********************************************************************/

struct suspend_private {
	int num_entries;
	int num_finished;
	const SMB_STRUCT_AIOCB * const *aiocb_array;
};

/************************************************************************
 Handle a single finished io from suspend.
***********************************************************************/
static void aio_linux_handle_suspend_io_finished(struct suspend_private *sp,
					struct io_event *ioev)
{
	struct aio_private_data *pd = (struct aio_private_data *)ioev->data;
	struct io_event *new_ioev = NULL;
	struct tevent_immediate *im = NULL;
	int i;

	/* Is this a requestid with an aiocb we're interested in ? */
	for (i = 0; i < sp->num_entries; i++) {
		if (sp->aiocb_array[i] == pd->aiocb) {
			sp->num_finished++;
			aio_linux_handle_io_finished(ioev);
			return;
		}
	}

	/* Jobid completed we weren't waiting for.
	   We must reschedule this as an immediate event
	   on the main event context. */
	im = tevent_create_immediate(NULL);
	if (!im) {
		exit_server_cleanly("aio_linux_handle_suspend_completion: no memory");
	}

	new_ioev = (struct io_event *)talloc_memdup(NULL,
						ioev,
						sizeof(struct io_event));
	if (!new_ioev) {
		exit_server_cleanly("aio_linux_handle_suspend_completion: no memory");
	}

	DEBUG(10,("aio_linux_handle_suspend_completion: "
			"re-scheduling requestid %d\n",
			pd->requestid));

	tevent_schedule_immediate(im,
			server_event_context(),
			aio_linux_handle_immediate,
			(void *)new_ioev);
}
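
/*
 * Note: aio_linux_suspend() runs its own nested event loop, so completions
 * that belong to requests it is not waiting for cannot simply be dropped
 * here; they are replayed as immediate events on the main event context
 * and handled once the suspend call has returned.
 */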

/************************************************************************
 Callback when an IO completes from a suspend call.
***********************************************************************/

static void aio_linux_handle_suspend_completion(struct event_context *event_ctx,
				struct fd_event *event,
				uint16 flags,
				void *p)
{
	struct suspend_private *sp = (struct suspend_private *)p;
	uint64_t remaining_events = sp->num_entries - sp->num_finished;
	uint64_t num_events = 0;

	DEBUG(10, ("aio_linux_handle_suspend_completion called with flags=%d\n",
			(int)flags));

	if ((flags & EVENT_FD_READ) == 0) {
		return;
	}

	/* Read the number of events available. */
	if (sys_read(event_fd, &num_events, sizeof(num_events)) !=
			sizeof(num_events)) {
		smb_panic("aio_linux_handle_suspend_completion: invalid read");
	}

	while (num_events > 0) {
		uint64_t events_to_read = MIN(num_events, remaining_events);
		struct timespec ts;
		int i;
		int ret;

		ts.tv_sec = 0;
		ts.tv_nsec = 0;

		ret = io_getevents(io_ctx,
			1, /* min_nr */
			(long)events_to_read,
			io_recv_events,
			&ts);

		if (ret < 0) {
			errno = -ret;
			DEBUG(1, ("aio_linux_handle_suspend_completion: "
				"io_getevents error %s\n",
				strerror(errno) ));
			return;
		}

		if (ret == 0) {
			DEBUG(10, ("aio_linux_handle_suspend_completion: "
				"io_getevents returned 0\n"));
			continue;
		}

		/* ret is positive. */
		for (i = 0; i < ret; i++) {
			aio_linux_handle_suspend_io_finished(sp,
						&io_recv_events[i]);
		}

		num_events -= ret;
	}
}

static void aio_linux_suspend_timed_out(struct tevent_context *event_ctx,
					struct tevent_timer *te,
					struct timeval now,
					void *private_data)
{
	bool *timed_out = (bool *)private_data;
	/* Remove this timed event handler. */
	TALLOC_FREE(te);
	*timed_out = true;
}

/************************************************************************
 Called to request everything to stop until all IO is completed.
***********************************************************************/
static int aio_linux_suspend(struct vfs_handle_struct *handle,
			struct files_struct *fsp,
			const SMB_STRUCT_AIOCB * const aiocb_array[],
			int n,
			const struct timespec *timeout)
{
	struct event_context *ev = NULL;
	struct fd_event *sock_event = NULL;
	int ret = -1;
	struct suspend_private sp;
	bool timed_out = false;
	TALLOC_CTX *frame = talloc_stackframe();

	/* This is a blocking call, and has to use a sub-event loop. */
	ev = event_context_init(frame);
	if (ev == NULL) {
		errno = ENOMEM;
		goto out;
	}

	if (timeout) {
		struct timeval tv = convert_timespec_to_timeval(*timeout);
		struct tevent_timer *te = tevent_add_timer(ev,
						frame,
						timeval_current_ofs(tv.tv_sec,
								tv.tv_usec),
						aio_linux_suspend_timed_out,
						&timed_out);
		if (!te) {
			errno = ENOMEM;
			goto out;
		}
	}

	ZERO_STRUCT(sp);
	sp.num_entries = n;
	sp.aiocb_array = aiocb_array;
	sp.num_finished = 0;

	sock_event = tevent_add_fd(ev,
				frame,
				event_fd,
				TEVENT_FD_READ,
				aio_linux_handle_suspend_completion,
				(void *)&sp);
	if (sock_event == NULL) {
		goto out;
	}

	/*
	 * We're going to cheat here. We know that smbd/aio.c
	 * only calls this when it's waiting for every single
	 * outstanding call to finish on a close, so just wait
	 * individually for each IO to complete. We don't care
	 * what order they finish - only that they all do. JRA.
	 */

	while (sp.num_entries != sp.num_finished) {
		if (tevent_loop_once(ev) == -1) {
			goto out;
		}

		if (timed_out) {
			errno = EAGAIN;
			goto out;
		}
	}

	ret = 0;

  out:

	TALLOC_FREE(frame);
	return ret;
}

static int aio_linux_connect(vfs_handle_struct *handle, const char *service,
			       const char *user)
{
	/*********************************************************************
	 * How many io_events to initialize ?
	 * 128 per process seems insane as a default until you realize that
	 * (a) Throttling is done in SMB2 via the crediting algorithm.
	 * (b) SMB1 clients are limited to max_mux (50) outstanding
	 *     requests and Windows clients don't use this anyway.
	 * Essentially we want this to be unlimited unless smb.conf
	 * says different.
	 *********************************************************************/
	aio_pending_size = lp_parm_int(
		SNUM(handle->conn), "aio_linux", "aio num events", 128);
	return SMB_VFS_NEXT_CONNECT(handle, service, user);
}
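
/*
 * Illustrative smb.conf usage (an assumed typical configuration, not taken
 * from this file): the parametric option read above maps to a share-level
 * setting such as
 *
 *   [share]
 *       aio_linux:aio num events = 256
 *
 * which overrides the default queue depth of 128.
 */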

static struct vfs_fn_pointers vfs_aio_linux_fns = {
	.connect_fn = aio_linux_connect,
	.aio_read_fn = aio_linux_read,
	.aio_write_fn = aio_linux_write,
	.aio_return_fn = aio_linux_return_fn,
	.aio_cancel_fn = aio_linux_cancel,
	.aio_error_fn = aio_linux_error_fn,
	.aio_suspend_fn = aio_linux_suspend,
};

NTSTATUS vfs_aio_linux_init(void)
{
	return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
				"aio_linux", &vfs_aio_linux_fns);
}