s3:smb2_flush: make use of file_fsp_smb2()
[Samba/gebeck_regimport.git] / source3 / modules / vfs_aio_linux.c
blobd152f3550188137658b87e75f1e4e71a8b7054e2
1 /*
2 * Simulate Posix AIO using Linux kernel AIO.
4 * Copyright (C) Jeremy Allison 2012
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 3 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 #include "includes.h"
22 #include "system/filesys.h"
23 #include "smbd/smbd.h"
24 #include "smbd/globals.h"
25 #include <sys/eventfd.h>
26 #include <libaio.h>
28 struct aio_extra;
29 static int event_fd = -1;
30 static io_context_t io_ctx;
31 static int aio_linux_requestid;
32 static struct io_event *io_recv_events;
33 static struct fd_event *aio_read_event;
35 struct aio_private_data {
36 struct aio_private_data *prev, *next;
37 int requestid;
38 SMB_STRUCT_AIOCB *aiocb;
39 struct iocb *event_iocb;
40 ssize_t ret_size;
41 int ret_errno;
42 bool cancelled;
45 /* List of outstanding requests we have. */
46 static struct aio_private_data *pd_list;
48 static void aio_linux_handle_completion(struct event_context *event_ctx,
49 struct fd_event *event,
50 uint16 flags,
51 void *p);
53 /************************************************************************
54 Housekeeping. Cleanup if no activity for 30 seconds.
55 ***********************************************************************/
57 static void aio_linux_housekeeping(struct tevent_context *event_ctx,
58 struct tevent_timer *te,
59 struct timeval now,
60 void *private_data)
62 /* Remove this timed event handler. */
63 TALLOC_FREE(te);
65 if (pd_list != NULL) {
66 /* Still busy. Look again in 30 seconds. */
67 (void)tevent_add_timer(event_ctx,
68 NULL,
69 timeval_current_ofs(30, 0),
70 aio_linux_housekeeping,
71 NULL);
72 return;
75 /* No activity for 30 seconds. Close out kernel resources. */
76 io_queue_release(io_ctx);
77 memset(&io_ctx, '\0', sizeof(io_ctx));
79 if (event_fd != -1) {
80 close(event_fd);
81 event_fd = -1;
84 TALLOC_FREE(aio_read_event);
85 TALLOC_FREE(io_recv_events);
88 /************************************************************************
89 Ensure event fd and aio context are initialized.
90 ***********************************************************************/
92 static bool init_aio_linux(struct vfs_handle_struct *handle)
94 struct tevent_timer *te = NULL;
96 if (event_fd != -1) {
97 /* Already initialized. */
98 return true;
101 /* Schedule a shutdown event for 30 seconds from now. */
102 te = tevent_add_timer(server_event_context(),
103 NULL,
104 timeval_current_ofs(30, 0),
105 aio_linux_housekeeping,
106 NULL);
108 if (te == NULL) {
109 goto fail;
112 /* Ensure we have enough space for aio_pending_size events. */
113 io_recv_events = talloc_zero_array(NULL,
114 struct io_event,
115 aio_pending_size);
116 if (io_recv_events == NULL) {
117 goto fail;
120 event_fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
121 if (event_fd == -1) {
122 goto fail;
125 aio_read_event = tevent_add_fd(server_event_context(),
126 NULL,
127 event_fd,
128 TEVENT_FD_READ,
129 aio_linux_handle_completion,
130 NULL);
131 if (aio_read_event == NULL) {
132 goto fail;
135 if (io_queue_init(aio_pending_size, &io_ctx)) {
136 goto fail;
139 DEBUG(10,("init_aio_linux: initialized with up to %d events\n",
140 aio_pending_size));
142 return true;
144 fail:
146 DEBUG(10,("init_aio_linux: initialization failed\n"));
148 TALLOC_FREE(te);
149 TALLOC_FREE(io_recv_events);
150 TALLOC_FREE(aio_read_event);
151 if (event_fd != -1) {
152 close(event_fd);
153 event_fd = -1;
155 memset(&io_ctx, '\0', sizeof(io_ctx));
156 return false;
159 /************************************************************************
160 Private data destructor.
161 ***********************************************************************/
163 static int pd_destructor(struct aio_private_data *pd)
165 DLIST_REMOVE(pd_list, pd);
166 return 0;
169 /************************************************************************
170 Create and initialize a private data struct.
171 ***********************************************************************/
173 static struct aio_private_data *create_private_data(TALLOC_CTX *ctx,
174 SMB_STRUCT_AIOCB *aiocb)
176 struct aio_private_data *pd = talloc_zero(ctx, struct aio_private_data);
177 if (!pd) {
178 return NULL;
180 pd->event_iocb = talloc_zero(pd, struct iocb);
181 pd->requestid = aio_linux_requestid++;
182 pd->aiocb = aiocb;
183 pd->ret_size = -1;
184 pd->ret_errno = EINPROGRESS;
185 talloc_set_destructor(pd, pd_destructor);
186 DLIST_ADD_END(pd_list, pd, struct aio_private_data *);
187 return pd;
190 /************************************************************************
191 Initiate an asynchronous pread call.
192 ***********************************************************************/
194 static int aio_linux_read(struct vfs_handle_struct *handle,
195 struct files_struct *fsp,
196 SMB_STRUCT_AIOCB *aiocb)
198 struct aio_extra *aio_ex = (struct aio_extra *)aiocb->aio_sigevent.sigev_value.sival_ptr;
199 struct aio_private_data *pd = NULL;
200 int ret;
202 if (!init_aio_linux(handle)) {
203 return -1;
206 pd = create_private_data(aio_ex, aiocb);
207 if (pd == NULL) {
208 DEBUG(10, ("aio_linux_read: Could not create private data.\n"));
209 return -1;
212 io_prep_pread(pd->event_iocb,
213 pd->aiocb->aio_fildes,
214 discard_const(pd->aiocb->aio_buf),
215 pd->aiocb->aio_nbytes,
216 pd->aiocb->aio_offset);
217 io_set_eventfd(pd->event_iocb, event_fd);
218 /* Use the callback pointer as a private data ptr. */
219 io_set_callback(pd->event_iocb, (io_callback_t)pd);
221 ret = io_submit(io_ctx, 1, &pd->event_iocb);
222 if (ret < 0) {
223 errno = ret;
224 return -1;
227 DEBUG(10, ("aio_linux_read: requestid=%d read requested "
228 "of %llu bytes at offset %llu\n",
229 pd->requestid,
230 (unsigned long long)pd->aiocb->aio_nbytes,
231 (unsigned long long)pd->aiocb->aio_offset));
233 return 0;
236 /************************************************************************
237 Initiate an asynchronous pwrite call.
238 ***********************************************************************/
240 static int aio_linux_write(struct vfs_handle_struct *handle,
241 struct files_struct *fsp,
242 SMB_STRUCT_AIOCB *aiocb)
244 struct aio_extra *aio_ex = (struct aio_extra *)aiocb->aio_sigevent.sigev_value.sival_ptr;
245 struct aio_private_data *pd = NULL;
246 int ret;
248 if (!init_aio_linux(handle)) {
249 return -1;
252 pd = create_private_data(aio_ex, aiocb);
253 if (pd == NULL) {
254 DEBUG(10, ("aio_linux_write: Could not create private data.\n"));
255 return -1;
258 io_prep_pwrite(pd->event_iocb,
259 pd->aiocb->aio_fildes,
260 discard_const(pd->aiocb->aio_buf),
261 pd->aiocb->aio_nbytes,
262 pd->aiocb->aio_offset);
263 io_set_eventfd(pd->event_iocb, event_fd);
264 /* Use the callback pointer as a private data ptr. */
265 io_set_callback(pd->event_iocb, (io_callback_t)pd);
267 ret = io_submit(io_ctx, 1, &pd->event_iocb);
268 if (ret < 0) {
269 errno = ret;
270 return -1;
273 DEBUG(10, ("aio_linux_write: requestid=%d pwrite requested "
274 "of %llu bytes at offset %llu\n",
275 pd->requestid,
276 (unsigned long long)pd->aiocb->aio_nbytes,
277 (unsigned long long)pd->aiocb->aio_offset));
279 return 0;
282 /************************************************************************
283 Save off the error / success conditions from the io_event.
284 Is idempotent (can be called multiple times given the same ioev).
285 ***********************************************************************/
287 static void aio_linux_setup_returns(struct io_event *ioev)
289 struct aio_private_data *pd = (struct aio_private_data *)ioev->data;
291 /* ioev->res2 contains the -errno if error. */
292 /* ioev->res contains the number of bytes sent/received. */
293 if (ioev->res2) {
294 pd->ret_size = -1;
295 pd->ret_errno = -ioev->res2;
296 } else {
297 pd->ret_size = ioev->res;
298 pd->ret_errno = 0;
302 /************************************************************************
303 Handle a single finished io.
304 ***********************************************************************/
306 static void aio_linux_handle_io_finished(struct io_event *ioev)
308 struct aio_extra *aio_ex = NULL;
309 struct aio_private_data *pd = (struct aio_private_data *)ioev->data;
311 aio_linux_setup_returns(ioev);
313 aio_ex = (struct aio_extra *)pd->aiocb->aio_sigevent.sigev_value.sival_ptr;
314 smbd_aio_complete_aio_ex(aio_ex);
316 DEBUG(10,("aio_linux_handle_io_finished: requestid %d completed\n",
317 pd->requestid ));
318 TALLOC_FREE(aio_ex);
321 /************************************************************************
322 Callback when multiple IOs complete.
323 ***********************************************************************/
325 static void aio_linux_handle_completion(struct event_context *event_ctx,
326 struct fd_event *event,
327 uint16 flags,
328 void *p)
330 uint64_t num_events = 0;
332 DEBUG(10, ("aio_linux_handle_completion called with flags=%d\n",
333 (int)flags));
335 if ((flags & EVENT_FD_READ) == 0) {
336 return;
339 /* Read the number of events available. */
340 if (sys_read(event_fd, &num_events, sizeof(num_events)) !=
341 sizeof(num_events)) {
342 smb_panic("aio_linux_handle_completion: invalid read");
345 while (num_events > 0) {
346 uint64_t events_to_read = MIN(num_events, aio_pending_size);
347 struct timespec ts;
348 int i;
349 int ret;
351 ts.tv_sec = 0;
352 ts.tv_nsec = 0;
354 ret = io_getevents(io_ctx,
356 (long)events_to_read,
357 io_recv_events,
358 &ts);
360 if (ret < 0) {
361 errno = -ret;
362 DEBUG(1, ("aio_linux_handle_completion: "
363 "io_getevents error %s\n",
364 strerror(errno) ));
365 return;
368 if (ret == 0) {
369 DEBUG(10, ("aio_linux_handle_completion: "
370 "io_getevents returned 0\n"));
371 continue;
374 /* ret is positive. */
375 for (i = 0; i < ret; i++) {
376 aio_linux_handle_io_finished(&io_recv_events[i]);
379 num_events -= ret;
383 /************************************************************************
384 Find the private data by aiocb.
385 ***********************************************************************/
387 static struct aio_private_data *find_private_data_by_aiocb(SMB_STRUCT_AIOCB *aiocb)
389 struct aio_private_data *pd;
391 for (pd = pd_list; pd != NULL; pd = pd->next) {
392 if (pd->aiocb == aiocb) {
393 return pd;
397 return NULL;
400 /************************************************************************
401 Called to return the result of a completed AIO.
402 Should only be called if aio_error returns something other than EINPROGRESS.
403 Returns:
404 Any other value - return from IO operation.
405 ***********************************************************************/
407 static ssize_t aio_linux_return_fn(struct vfs_handle_struct *handle,
408 struct files_struct *fsp,
409 SMB_STRUCT_AIOCB *aiocb)
411 struct aio_private_data *pd = find_private_data_by_aiocb(aiocb);
413 if (pd == NULL) {
414 errno = EINVAL;
415 DEBUG(0, ("aio_linux_return_fn: returning EINVAL\n"));
416 return -1;
419 pd->aiocb = NULL;
421 if (pd->cancelled) {
422 errno = ECANCELED;
423 return -1;
426 if (pd->ret_size == -1) {
427 errno = pd->ret_errno;
430 return pd->ret_size;
433 /************************************************************************
434 Called to check the result of an AIO.
435 Returns:
436 EINPROGRESS - still in progress.
437 EINVAL - invalid aiocb.
438 ECANCELED - request was cancelled.
439 0 - request completed successfully.
440 Any other value - errno from IO operation.
441 ***********************************************************************/
443 static int aio_linux_error_fn(struct vfs_handle_struct *handle,
444 struct files_struct *fsp,
445 SMB_STRUCT_AIOCB *aiocb)
447 struct aio_private_data *pd = find_private_data_by_aiocb(aiocb);
449 if (pd == NULL) {
450 return EINVAL;
452 if (pd->cancelled) {
453 return ECANCELED;
455 return pd->ret_errno;
458 /************************************************************************
459 Called to request the cancel of an AIO, or all of them on a specific
460 fsp if aiocb == NULL.
461 ***********************************************************************/
463 static int aio_linux_cancel(struct vfs_handle_struct *handle,
464 struct files_struct *fsp,
465 SMB_STRUCT_AIOCB *aiocb)
467 struct aio_private_data *pd = NULL;
469 for (pd = pd_list; pd != NULL; pd = pd->next) {
470 if (pd->aiocb == NULL) {
471 continue;
473 if (pd->aiocb->aio_fildes != fsp->fh->fd) {
474 continue;
476 if ((aiocb != NULL) && (pd->aiocb != aiocb)) {
477 continue;
481 * We let the kernel do its job, but we discard the result when
482 * it's finished. NB. Should I call io_cancel here ?
485 pd->cancelled = true;
488 return AIO_CANCELED;
/************************************************************************
 Callback for a previously detected job completion deferred to the main
 loop.

 private_data is the heap copy of the io_event made when the completion
 was deferred. Both that copy and the immediate event itself are freed
 here; the code that schedules this callback allocates each of them
 with a NULL talloc parent, so nothing else would release them.
***********************************************************************/

static void aio_linux_handle_immediate(struct tevent_context *ctx,
				struct tevent_immediate *im,
				void *private_data)
{
	struct io_event *ioev = (struct io_event *)private_data;

	aio_linux_handle_io_finished(ioev);
	TALLOC_FREE(ioev);
	/* One immediate is created per deferred completion; don't leak it. */
	TALLOC_FREE(im);
}
506 /************************************************************************
507 Private data struct used in suspend completion code.
508 ***********************************************************************/
510 struct suspend_private {
511 int num_entries;
512 int num_finished;
513 const SMB_STRUCT_AIOCB * const *aiocb_array;
516 /************************************************************************
517 Handle a single finished io from suspend.
518 ***********************************************************************/
520 static void aio_linux_handle_suspend_io_finished(struct suspend_private *sp,
521 struct io_event *ioev)
523 struct aio_private_data *pd = (struct aio_private_data *)ioev->data;
524 struct io_event *new_ioev = NULL;
525 struct tevent_immediate *im = NULL;
526 int i;
528 /* Is this a requestid with an aiocb we're interested in ? */
529 for (i = 0; i < sp->num_entries; i++) {
530 if (sp->aiocb_array[i] == pd->aiocb) {
531 sp->num_finished++;
533 * We don't call aio_linux_handle_io_finished()
534 * here, but only the function that sets up the
535 * return values. This allows
536 * aio_linux_handle_io_finished() to be successfully
537 * called from smbd/aio.c:wait_for_aio_completion()
538 * once we return from here with all io's done.
540 aio_linux_setup_returns(ioev);
541 return;
545 /* Jobid completed we weren't waiting for.
546 We must reshedule this as an immediate event
547 on the main event context. */
548 im = tevent_create_immediate(NULL);
549 if (!im) {
550 exit_server_cleanly("aio_linux_handle_suspend_completion: no memory");
553 new_ioev = (struct io_event *)talloc_memdup(NULL,
554 ioev,
555 sizeof(struct io_event));
556 if (!new_ioev) {
557 exit_server_cleanly("aio_linux_handle_suspend_completion: no memory");
560 DEBUG(10,("aio_linux_handle_suspend_completion: "
561 "re-scheduling requestid %d\n",
562 pd->requestid));
564 tevent_schedule_immediate(im,
565 server_event_context(),
566 aio_linux_handle_immediate,
567 (void *)new_ioev);
570 /************************************************************************
571 Callback when an IO completes from a suspend call.
572 ***********************************************************************/
574 static void aio_linux_handle_suspend_completion(struct event_context *event_ctx,
575 struct fd_event *event,
576 uint16 flags,
577 void *p)
579 struct suspend_private *sp = (struct suspend_private *)p;
580 uint64_t remaining_events = sp->num_entries - sp->num_finished;
581 uint64_t num_events = 0;
583 DEBUG(10, ("aio_linux_handle_suspend_completion called with flags=%d\n",
584 (int)flags));
586 if ((flags & EVENT_FD_READ) == 0) {
587 return;
590 /* Read the number of events available. */
591 if (sys_read(event_fd, &num_events, sizeof(num_events)) !=
592 sizeof(num_events)) {
593 smb_panic("aio_linux_handle_completion: invalid read");
596 while (num_events > 0) {
597 uint64_t events_to_read = MIN(num_events, remaining_events);
598 struct timespec ts;
599 int i;
600 int ret;
602 ts.tv_sec = 0;
603 ts.tv_nsec = 0;
605 ret = io_getevents(io_ctx,
607 (long)events_to_read,
608 io_recv_events,
609 &ts);
611 if (ret < 0) {
612 errno = -ret;
613 DEBUG(1, ("aio_linux_handle_suspend_completion: "
614 "io_getevents error %s\n",
615 strerror(errno) ));
616 return;
619 if (ret == 0) {
620 DEBUG(10, ("aio_linux_handle_suspend_completion: "
621 "io_getevents returned 0\n"));
622 continue;
625 /* ret is positive. */
626 for (i = 0; i < ret; i++) {
627 aio_linux_handle_suspend_io_finished(sp,
628 &io_recv_events[i]);
631 num_events -= ret;
635 static void aio_linux_suspend_timed_out(struct tevent_context *event_ctx,
636 struct tevent_timer *te,
637 struct timeval now,
638 void *private_data)
640 bool *timed_out = (bool *)private_data;
641 /* Remove this timed event handler. */
642 TALLOC_FREE(te);
643 *timed_out = true;
646 /************************************************************************
647 Called to request everything to stop until all IO is completed.
648 ***********************************************************************/
650 static int aio_linux_suspend(struct vfs_handle_struct *handle,
651 struct files_struct *fsp,
652 const SMB_STRUCT_AIOCB * const aiocb_array[],
653 int n,
654 const struct timespec *timeout)
656 struct event_context *ev = NULL;
657 struct fd_event *sock_event = NULL;
658 int ret = -1;
659 struct suspend_private sp;
660 bool timed_out = false;
661 TALLOC_CTX *frame = talloc_stackframe();
663 /* This is a blocking call, and has to use a sub-event loop. */
664 ev = event_context_init(frame);
665 if (ev == NULL) {
666 errno = ENOMEM;
667 goto out;
670 if (timeout) {
671 struct timeval tv = convert_timespec_to_timeval(*timeout);
672 struct tevent_timer *te = tevent_add_timer(ev,
673 frame,
674 timeval_current_ofs(tv.tv_sec,
675 tv.tv_usec),
676 aio_linux_suspend_timed_out,
677 &timed_out);
678 if (!te) {
679 errno = ENOMEM;
680 goto out;
684 ZERO_STRUCT(sp);
685 sp.num_entries = n;
686 sp.aiocb_array = aiocb_array;
687 sp.num_finished = 0;
689 sock_event = tevent_add_fd(ev,
690 frame,
691 event_fd,
692 TEVENT_FD_READ,
693 aio_linux_handle_suspend_completion,
694 (void *)&sp);
695 if (sock_event == NULL) {
696 goto out;
699 * We're going to cheat here. We know that smbd/aio.c
700 * only calls this when it's waiting for every single
701 * outstanding call to finish on a close, so just wait
702 * individually for each IO to complete. We don't care
703 * what order they finish - only that they all do. JRA.
705 while (sp.num_entries != sp.num_finished) {
706 if (tevent_loop_once(ev) == -1) {
707 goto out;
710 if (timed_out) {
711 errno = EAGAIN;
712 goto out;
716 ret = 0;
718 out:
720 TALLOC_FREE(frame);
721 return ret;
724 static int aio_linux_connect(vfs_handle_struct *handle, const char *service,
725 const char *user)
727 /*********************************************************************
728 * How many io_events to initialize ?
729 * 128 per process seems insane as a default until you realize that
730 * (a) Throttling is done in SMB2 via the crediting algorithm.
731 * (b) SMB1 clients are limited to max_mux (50) outstanding
732 * requests and Windows clients don't use this anyway.
733 * Essentially we want this to be unlimited unless smb.conf
734 * says different.
735 *********************************************************************/
736 aio_pending_size = lp_parm_int(
737 SNUM(handle->conn), "aio_linux", "aio num events", 128);
738 return SMB_VFS_NEXT_CONNECT(handle, service, user);
741 static struct vfs_fn_pointers vfs_aio_linux_fns = {
742 .connect_fn = aio_linux_connect,
743 .aio_read_fn = aio_linux_read,
744 .aio_write_fn = aio_linux_write,
745 .aio_return_fn = aio_linux_return_fn,
746 .aio_cancel_fn = aio_linux_cancel,
747 .aio_error_fn = aio_linux_error_fn,
748 .aio_suspend_fn = aio_linux_suspend,
751 NTSTATUS vfs_aio_linux_init(void)
753 return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
754 "aio_linux", &vfs_aio_linux_fns);