2 * Simulate Posix AIO using Linux kernel AIO.
4 * Copyright (C) Jeremy Allison 2012
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 3 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 #include "system/filesys.h"
23 #include "smbd/smbd.h"
24 #include "smbd/globals.h"
25 #include <sys/eventfd.h>
29 static int event_fd
= -1;
30 static io_context_t io_ctx
;
31 static int aio_linux_requestid
;
32 static struct io_event
*io_recv_events
;
33 static struct fd_event
*aio_read_event
;
35 struct aio_private_data
{
36 struct aio_private_data
*prev
, *next
;
38 SMB_STRUCT_AIOCB
*aiocb
;
39 struct iocb
*event_iocb
;
45 /* List of outstanding requests we have. */
46 static struct aio_private_data
*pd_list
;
/* Forward declaration: fd-event callback fired when event_fd is readable. */
static void aio_linux_handle_completion(struct event_context *event_ctx,
					struct fd_event *event,
					uint16_t flags,
					void *p);
53 /************************************************************************
54 Housekeeping. Cleanup if no activity for 30 seconds.
55 ***********************************************************************/
57 static void aio_linux_housekeeping(struct tevent_context
*event_ctx
,
58 struct tevent_timer
*te
,
62 /* Remove this timed event handler. */
65 if (pd_list
!= NULL
) {
66 /* Still busy. Look again in 30 seconds. */
67 (void)tevent_add_timer(event_ctx
,
69 timeval_current_ofs(30, 0),
70 aio_linux_housekeeping
,
75 /* No activity for 30 seconds. Close out kernel resources. */
76 io_queue_release(io_ctx
);
77 memset(&io_ctx
, '\0', sizeof(io_ctx
));
84 TALLOC_FREE(aio_read_event
);
85 TALLOC_FREE(io_recv_events
);
88 /************************************************************************
89 Ensure event fd and aio context are initialized.
90 ***********************************************************************/
92 static bool init_aio_linux(struct vfs_handle_struct
*handle
)
94 struct tevent_timer
*te
= NULL
;
97 /* Already initialized. */
101 /* Schedule a shutdown event for 30 seconds from now. */
102 te
= tevent_add_timer(server_event_context(),
104 timeval_current_ofs(30, 0),
105 aio_linux_housekeeping
,
112 /* Ensure we have enough space for aio_pending_size events. */
113 io_recv_events
= talloc_zero_array(NULL
,
116 if (io_recv_events
== NULL
) {
120 event_fd
= eventfd(0, EFD_NONBLOCK
| EFD_CLOEXEC
);
121 if (event_fd
== -1) {
125 aio_read_event
= tevent_add_fd(server_event_context(),
129 aio_linux_handle_completion
,
131 if (aio_read_event
== NULL
) {
135 if (io_queue_init(aio_pending_size
, &io_ctx
)) {
139 DEBUG(10,("init_aio_linux: initialized with up to %d events\n",
146 DEBUG(10,("init_aio_linux: initialization failed\n"));
149 TALLOC_FREE(io_recv_events
);
150 TALLOC_FREE(aio_read_event
);
151 if (event_fd
!= -1) {
155 memset(&io_ctx
, '\0', sizeof(io_ctx
));
159 /************************************************************************
160 Private data destructor.
161 ***********************************************************************/
163 static int pd_destructor(struct aio_private_data
*pd
)
165 DLIST_REMOVE(pd_list
, pd
);
169 /************************************************************************
170 Create and initialize a private data struct.
171 ***********************************************************************/
173 static struct aio_private_data
*create_private_data(TALLOC_CTX
*ctx
,
174 SMB_STRUCT_AIOCB
*aiocb
)
176 struct aio_private_data
*pd
= talloc_zero(ctx
, struct aio_private_data
);
180 pd
->event_iocb
= talloc_zero(pd
, struct iocb
);
181 pd
->requestid
= aio_linux_requestid
++;
184 pd
->ret_errno
= EINPROGRESS
;
185 talloc_set_destructor(pd
, pd_destructor
);
186 DLIST_ADD_END(pd_list
, pd
, struct aio_private_data
*);
190 /************************************************************************
191 Initiate an asynchronous pread call.
192 ***********************************************************************/
194 static int aio_linux_read(struct vfs_handle_struct
*handle
,
195 struct files_struct
*fsp
,
196 SMB_STRUCT_AIOCB
*aiocb
)
198 struct aio_extra
*aio_ex
= (struct aio_extra
*)aiocb
->aio_sigevent
.sigev_value
.sival_ptr
;
199 struct aio_private_data
*pd
= NULL
;
202 if (!init_aio_linux(handle
)) {
206 pd
= create_private_data(aio_ex
, aiocb
);
208 DEBUG(10, ("aio_linux_read: Could not create private data.\n"));
212 io_prep_pread(pd
->event_iocb
,
213 pd
->aiocb
->aio_fildes
,
214 discard_const(pd
->aiocb
->aio_buf
),
215 pd
->aiocb
->aio_nbytes
,
216 pd
->aiocb
->aio_offset
);
217 io_set_eventfd(pd
->event_iocb
, event_fd
);
218 /* Use the callback pointer as a private data ptr. */
219 io_set_callback(pd
->event_iocb
, (io_callback_t
)pd
);
221 ret
= io_submit(io_ctx
, 1, &pd
->event_iocb
);
227 DEBUG(10, ("aio_linux_read: requestid=%d read requested "
228 "of %llu bytes at offset %llu\n",
230 (unsigned long long)pd
->aiocb
->aio_nbytes
,
231 (unsigned long long)pd
->aiocb
->aio_offset
));
236 /************************************************************************
237 Initiate an asynchronous pwrite call.
238 ***********************************************************************/
240 static int aio_linux_write(struct vfs_handle_struct
*handle
,
241 struct files_struct
*fsp
,
242 SMB_STRUCT_AIOCB
*aiocb
)
244 struct aio_extra
*aio_ex
= (struct aio_extra
*)aiocb
->aio_sigevent
.sigev_value
.sival_ptr
;
245 struct aio_private_data
*pd
= NULL
;
248 if (!init_aio_linux(handle
)) {
252 pd
= create_private_data(aio_ex
, aiocb
);
254 DEBUG(10, ("aio_linux_write: Could not create private data.\n"));
258 io_prep_pwrite(pd
->event_iocb
,
259 pd
->aiocb
->aio_fildes
,
260 discard_const(pd
->aiocb
->aio_buf
),
261 pd
->aiocb
->aio_nbytes
,
262 pd
->aiocb
->aio_offset
);
263 io_set_eventfd(pd
->event_iocb
, event_fd
);
264 /* Use the callback pointer as a private data ptr. */
265 io_set_callback(pd
->event_iocb
, (io_callback_t
)pd
);
267 ret
= io_submit(io_ctx
, 1, &pd
->event_iocb
);
273 DEBUG(10, ("aio_linux_write: requestid=%d pwrite requested "
274 "of %llu bytes at offset %llu\n",
276 (unsigned long long)pd
->aiocb
->aio_nbytes
,
277 (unsigned long long)pd
->aiocb
->aio_offset
));
282 /************************************************************************
283 Save off the error / success conditions from the io_event.
284 Is idempotent (can be called multiple times given the same ioev).
285 ***********************************************************************/
287 static void aio_linux_setup_returns(struct io_event
*ioev
)
289 struct aio_private_data
*pd
= (struct aio_private_data
*)ioev
->data
;
291 /* ioev->res2 contains the -errno if error. */
292 /* ioev->res contains the number of bytes sent/received. */
295 pd
->ret_errno
= -ioev
->res2
;
297 pd
->ret_size
= ioev
->res
;
302 /************************************************************************
303 Handle a single finished io.
304 ***********************************************************************/
306 static void aio_linux_handle_io_finished(struct io_event
*ioev
)
308 struct aio_extra
*aio_ex
= NULL
;
309 struct aio_private_data
*pd
= (struct aio_private_data
*)ioev
->data
;
311 aio_linux_setup_returns(ioev
);
313 aio_ex
= (struct aio_extra
*)pd
->aiocb
->aio_sigevent
.sigev_value
.sival_ptr
;
314 smbd_aio_complete_aio_ex(aio_ex
);
316 DEBUG(10,("aio_linux_handle_io_finished: requestid %d completed\n",
321 /************************************************************************
322 Callback when multiple IOs complete.
323 ***********************************************************************/
325 static void aio_linux_handle_completion(struct event_context
*event_ctx
,
326 struct fd_event
*event
,
330 uint64_t num_events
= 0;
332 DEBUG(10, ("aio_linux_handle_completion called with flags=%d\n",
335 if ((flags
& EVENT_FD_READ
) == 0) {
339 /* Read the number of events available. */
340 if (sys_read(event_fd
, &num_events
, sizeof(num_events
)) !=
341 sizeof(num_events
)) {
342 smb_panic("aio_linux_handle_completion: invalid read");
345 while (num_events
> 0) {
346 uint64_t events_to_read
= MIN(num_events
, aio_pending_size
);
354 ret
= io_getevents(io_ctx
,
356 (long)events_to_read
,
362 DEBUG(1, ("aio_linux_handle_completion: "
363 "io_getevents error %s\n",
369 DEBUG(10, ("aio_linux_handle_completion: "
370 "io_getevents returned 0\n"));
374 /* ret is positive. */
375 for (i
= 0; i
< ret
; i
++) {
376 aio_linux_handle_io_finished(&io_recv_events
[i
]);
383 /************************************************************************
384 Find the private data by aiocb.
385 ***********************************************************************/
387 static struct aio_private_data
*find_private_data_by_aiocb(SMB_STRUCT_AIOCB
*aiocb
)
389 struct aio_private_data
*pd
;
391 for (pd
= pd_list
; pd
!= NULL
; pd
= pd
->next
) {
392 if (pd
->aiocb
== aiocb
) {
400 /************************************************************************
401 Called to return the result of a completed AIO.
402 Should only be called if aio_error returns something other than EINPROGRESS.
404 Any other value - return from IO operation.
405 ***********************************************************************/
407 static ssize_t
aio_linux_return_fn(struct vfs_handle_struct
*handle
,
408 struct files_struct
*fsp
,
409 SMB_STRUCT_AIOCB
*aiocb
)
411 struct aio_private_data
*pd
= find_private_data_by_aiocb(aiocb
);
415 DEBUG(0, ("aio_linux_return_fn: returning EINVAL\n"));
426 if (pd
->ret_size
== -1) {
427 errno
= pd
->ret_errno
;
433 /************************************************************************
434 Called to check the result of an AIO.
436 EINPROGRESS - still in progress.
437 EINVAL - invalid aiocb.
438 ECANCELED - request was cancelled.
439 0 - request completed successfully.
440 Any other value - errno from IO operation.
441 ***********************************************************************/
443 static int aio_linux_error_fn(struct vfs_handle_struct
*handle
,
444 struct files_struct
*fsp
,
445 SMB_STRUCT_AIOCB
*aiocb
)
447 struct aio_private_data
*pd
= find_private_data_by_aiocb(aiocb
);
455 return pd
->ret_errno
;
458 /************************************************************************
459 Called to request the cancel of an AIO, or all of them on a specific
460 fsp if aiocb == NULL.
461 ***********************************************************************/
463 static int aio_linux_cancel(struct vfs_handle_struct
*handle
,
464 struct files_struct
*fsp
,
465 SMB_STRUCT_AIOCB
*aiocb
)
467 struct aio_private_data
*pd
= NULL
;
469 for (pd
= pd_list
; pd
!= NULL
; pd
= pd
->next
) {
470 if (pd
->aiocb
== NULL
) {
473 if (pd
->aiocb
->aio_fildes
!= fsp
->fh
->fd
) {
476 if ((aiocb
!= NULL
) && (pd
->aiocb
!= aiocb
)) {
481 * We let the kernel do its job, but we discard the result when
482 * it's finished. NB. Should I call io_cancel here ?
485 pd
->cancelled
= true;
/************************************************************************
 Callback for a previously detected job completion deferred to the main
 loop.
***********************************************************************/

static void aio_linux_handle_immediate(struct tevent_context *ctx,
					struct tevent_immediate *im,
					void *private_data)
{
	/* private_data is a talloc'ed copy of the kernel io_event. */
	struct io_event *ioev = (struct io_event *)private_data;

	aio_linux_handle_io_finished(ioev);
	TALLOC_FREE(ioev);
}
506 /************************************************************************
507 Private data struct used in suspend completion code.
508 ***********************************************************************/
510 struct suspend_private
{
513 const SMB_STRUCT_AIOCB
* const *aiocb_array
;
516 /************************************************************************
517 Handle a single finished io from suspend.
518 ***********************************************************************/
520 static void aio_linux_handle_suspend_io_finished(struct suspend_private
*sp
,
521 struct io_event
*ioev
)
523 struct aio_private_data
*pd
= (struct aio_private_data
*)ioev
->data
;
524 struct io_event
*new_ioev
= NULL
;
525 struct tevent_immediate
*im
= NULL
;
528 /* Is this a requestid with an aiocb we're interested in ? */
529 for (i
= 0; i
< sp
->num_entries
; i
++) {
530 if (sp
->aiocb_array
[i
] == pd
->aiocb
) {
533 * We don't call aio_linux_handle_io_finished()
534 * here, but only the function that sets up the
535 * return values. This allows
536 * aio_linux_handle_io_finished() to be successfully
537 * called from smbd/aio.c:wait_for_aio_completion()
538 * once we return from here with all io's done.
540 aio_linux_setup_returns(ioev
);
545 /* Jobid completed we weren't waiting for.
546 We must reshedule this as an immediate event
547 on the main event context. */
548 im
= tevent_create_immediate(NULL
);
550 exit_server_cleanly("aio_linux_handle_suspend_completion: no memory");
553 new_ioev
= (struct io_event
*)talloc_memdup(NULL
,
555 sizeof(struct io_event
));
557 exit_server_cleanly("aio_linux_handle_suspend_completion: no memory");
560 DEBUG(10,("aio_linux_handle_suspend_completion: "
561 "re-scheduling requestid %d\n",
564 tevent_schedule_immediate(im
,
565 server_event_context(),
566 aio_linux_handle_immediate
,
570 /************************************************************************
571 Callback when an IO completes from a suspend call.
572 ***********************************************************************/
574 static void aio_linux_handle_suspend_completion(struct event_context
*event_ctx
,
575 struct fd_event
*event
,
579 struct suspend_private
*sp
= (struct suspend_private
*)p
;
580 uint64_t remaining_events
= sp
->num_entries
- sp
->num_finished
;
581 uint64_t num_events
= 0;
583 DEBUG(10, ("aio_linux_handle_suspend_completion called with flags=%d\n",
586 if ((flags
& EVENT_FD_READ
) == 0) {
590 /* Read the number of events available. */
591 if (sys_read(event_fd
, &num_events
, sizeof(num_events
)) !=
592 sizeof(num_events
)) {
593 smb_panic("aio_linux_handle_completion: invalid read");
596 while (num_events
> 0) {
597 uint64_t events_to_read
= MIN(num_events
, remaining_events
);
605 ret
= io_getevents(io_ctx
,
607 (long)events_to_read
,
613 DEBUG(1, ("aio_linux_handle_suspend_completion: "
614 "io_getevents error %s\n",
620 DEBUG(10, ("aio_linux_handle_suspend_completion: "
621 "io_getevents returned 0\n"));
625 /* ret is positive. */
626 for (i
= 0; i
< ret
; i
++) {
627 aio_linux_handle_suspend_io_finished(sp
,
635 static void aio_linux_suspend_timed_out(struct tevent_context
*event_ctx
,
636 struct tevent_timer
*te
,
640 bool *timed_out
= (bool *)private_data
;
641 /* Remove this timed event handler. */
646 /************************************************************************
647 Called to request everything to stop until all IO is completed.
648 ***********************************************************************/
650 static int aio_linux_suspend(struct vfs_handle_struct
*handle
,
651 struct files_struct
*fsp
,
652 const SMB_STRUCT_AIOCB
* const aiocb_array
[],
654 const struct timespec
*timeout
)
656 struct event_context
*ev
= NULL
;
657 struct fd_event
*sock_event
= NULL
;
659 struct suspend_private sp
;
660 bool timed_out
= false;
661 TALLOC_CTX
*frame
= talloc_stackframe();
663 /* This is a blocking call, and has to use a sub-event loop. */
664 ev
= event_context_init(frame
);
671 struct timeval tv
= convert_timespec_to_timeval(*timeout
);
672 struct tevent_timer
*te
= tevent_add_timer(ev
,
674 timeval_current_ofs(tv
.tv_sec
,
676 aio_linux_suspend_timed_out
,
686 sp
.aiocb_array
= aiocb_array
;
689 sock_event
= tevent_add_fd(ev
,
693 aio_linux_handle_suspend_completion
,
695 if (sock_event
== NULL
) {
699 * We're going to cheat here. We know that smbd/aio.c
700 * only calls this when it's waiting for every single
701 * outstanding call to finish on a close, so just wait
702 * individually for each IO to complete. We don't care
703 * what order they finish - only that they all do. JRA.
705 while (sp
.num_entries
!= sp
.num_finished
) {
706 if (tevent_loop_once(ev
) == -1) {
724 static int aio_linux_connect(vfs_handle_struct
*handle
, const char *service
,
727 /*********************************************************************
728 * How many io_events to initialize ?
729 * 128 per process seems insane as a default until you realize that
730 * (a) Throttling is done in SMB2 via the crediting algorithm.
731 * (b) SMB1 clients are limited to max_mux (50) outstanding
732 * requests and Windows clients don't use this anyway.
733 * Essentially we want this to be unlimited unless smb.conf
735 *********************************************************************/
736 aio_pending_size
= lp_parm_int(
737 SNUM(handle
->conn
), "aio_linux", "aio num events", 128);
738 return SMB_VFS_NEXT_CONNECT(handle
, service
, user
);
741 static struct vfs_fn_pointers vfs_aio_linux_fns
= {
742 .connect_fn
= aio_linux_connect
,
743 .aio_read_fn
= aio_linux_read
,
744 .aio_write_fn
= aio_linux_write
,
745 .aio_return_fn
= aio_linux_return_fn
,
746 .aio_cancel_fn
= aio_linux_cancel
,
747 .aio_error_fn
= aio_linux_error_fn
,
748 .aio_suspend_fn
= aio_linux_suspend
,
751 NTSTATUS
vfs_aio_linux_init(void)
753 return smb_register_vfs(SMB_VFS_INTERFACE_VERSION
,
754 "aio_linux", &vfs_aio_linux_fns
);