2 * Simulate Posix AIO using Linux kernel AIO.
4 * Copyright (C) Jeremy Allison 2012
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 3 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 #include "system/filesys.h"
23 #include "smbd/smbd.h"
24 #include "smbd/globals.h"
25 #include <sys/eventfd.h>
29 static int event_fd
= -1;
30 static io_context_t io_ctx
;
31 static int aio_linux_requestid
;
32 static struct io_event
*io_recv_events
;
33 static struct fd_event
*aio_read_event
;
35 struct aio_private_data
{
36 struct aio_private_data
*prev
, *next
;
38 SMB_STRUCT_AIOCB
*aiocb
;
39 struct iocb
*event_iocb
;
45 /* List of outstanding requests we have. */
46 static struct aio_private_data
*pd_list
;
48 static void aio_linux_handle_completion(struct event_context
*event_ctx
,
49 struct fd_event
*event
,
53 /************************************************************************
54 Housekeeping. Cleanup if no activity for 30 seconds.
55 ***********************************************************************/
/* Timer callback: tears down kernel AIO resources when idle, otherwise
 * re-arms itself for another 30 seconds.
 * NOTE(review): lines are missing from this extract (parameter tail, the
 * tevent_add_timer argument tail, and presumably a close(event_fd)). */
57 static void aio_linux_housekeeping(struct tevent_context
*event_ctx
,
58 struct tevent_timer
*te
,
62 /* Remove this timed event handler. */
65 if (pd_list
!= NULL
) {
66 /* Still busy. Look again in 30 seconds. */
67 (void)tevent_add_timer(event_ctx
,
69 timeval_current_ofs(30, 0),
70 aio_linux_housekeeping
,
75 /* No activity for 30 seconds. Close out kernel resources. */
76 io_queue_release(io_ctx
);
77 memset(&io_ctx
, '\0', sizeof(io_ctx
));
/* Drop the fd-watch and the event receive buffer; init_aio_linux()
 * recreates both on the next AIO request. */
84 TALLOC_FREE(aio_read_event
);
85 TALLOC_FREE(io_recv_events
);
88 /************************************************************************
89 Ensure event fd and aio context are initialized.
90 ***********************************************************************/
/* Lazily sets up everything kernel AIO needs: the housekeeping timer, the
 * io_event receive array, the eventfd, the tevent fd-watch, and the kernel
 * io context. Returns true on success; on any failure falls through to the
 * cleanup path at the bottom and (presumably) returns false — TODO confirm,
 * several lines are missing from this extract. */
92 static bool init_aio_linux(struct vfs_handle_struct
*handle
)
94 struct tevent_timer
*te
= NULL
;
97 /* Already initialized. */
101 /* Schedule a shutdown event for 30 seconds from now. */
102 te
= tevent_add_timer(server_event_context(),
104 timeval_current_ofs(30, 0),
105 aio_linux_housekeeping
,
112 /* Ensure we have enough space for aio_pending_size events. */
113 io_recv_events
= talloc_zero_array(NULL
,
116 if (io_recv_events
== NULL
) {
/* Non-blocking + close-on-exec so forked children don't inherit it. */
120 event_fd
= eventfd(0, EFD_NONBLOCK
| EFD_CLOEXEC
);
121 if (event_fd
== -1) {
125 aio_read_event
= tevent_add_fd(server_event_context(),
129 aio_linux_handle_completion
,
131 if (aio_read_event
== NULL
) {
/* io_queue_init() returns 0 on success, negative errno on failure. */
135 if (io_queue_init(aio_pending_size
, &io_ctx
)) {
139 DEBUG(10,("init_aio_linux: initialized with up to %d events\n",
/* Failure path: undo everything acquired above. */
146 DEBUG(10,("init_aio_linux: initialization failed\n"));
149 TALLOC_FREE(io_recv_events
);
150 TALLOC_FREE(aio_read_event
);
151 if (event_fd
!= -1) {
155 memset(&io_ctx
, '\0', sizeof(io_ctx
));
159 /************************************************************************
160 Private data destructor.
161 ***********************************************************************/
/* talloc destructor: unlink the request from the outstanding list when it
 * is freed, so pd_list never holds a dangling pointer. */
163 static int pd_destructor(struct aio_private_data
*pd
)
165 DLIST_REMOVE(pd_list
, pd
);
169 /************************************************************************
170 Create and initialize a private data struct.
171 ***********************************************************************/
/* Allocates a zeroed aio_private_data under ctx, gives it its own iocb and
 * a fresh requestid, marks it in-progress, and appends it to pd_list.
 * NOTE(review): NULL-check branches for the two allocations are among the
 * lines dropped from this extract. */
173 static struct aio_private_data
*create_private_data(TALLOC_CTX
*ctx
,
174 SMB_STRUCT_AIOCB
*aiocb
)
176 struct aio_private_data
*pd
= talloc_zero(ctx
, struct aio_private_data
);
180 pd
->event_iocb
= talloc_zero(pd
, struct iocb
);
181 pd
->requestid
= aio_linux_requestid
++;
/* EINPROGRESS until the completion handler records the real result. */
184 pd
->ret_errno
= EINPROGRESS
;
185 talloc_set_destructor(pd
, pd_destructor
);
186 DLIST_ADD_END(pd_list
, pd
, struct aio_private_data
*);
190 /************************************************************************
191 Initiate an asynchronous pread call.
192 ***********************************************************************/
/* VFS hook: submit an async pread of aiocb->aio_nbytes at aio_offset on
 * aio_fildes. Returns 0 on successful submission, -1 otherwise — TODO
 * confirm; the error-return lines are missing from this extract. */
194 static int aio_linux_read(struct vfs_handle_struct
*handle
,
195 struct files_struct
*fsp
,
196 SMB_STRUCT_AIOCB
*aiocb
)
/* smbd stashes its aio_extra in the sigevent's sival_ptr; recover it and
 * use it as the talloc parent so pd dies with the request. */
198 struct aio_extra
*aio_ex
= (struct aio_extra
*)aiocb
->aio_sigevent
.sigev_value
.sival_ptr
;
199 struct aio_private_data
*pd
= NULL
;
202 if (!init_aio_linux(handle
)) {
206 pd
= create_private_data(aio_ex
, aiocb
);
208 DEBUG(10, ("aio_linux_read: Could not create private data.\n"));
212 io_prep_pread(pd
->event_iocb
,
213 pd
->aiocb
->aio_fildes
,
214 discard_const(pd
->aiocb
->aio_buf
),
215 pd
->aiocb
->aio_nbytes
,
216 pd
->aiocb
->aio_offset
);
/* Tell the kernel to bump event_fd when this iocb completes. */
217 io_set_eventfd(pd
->event_iocb
, event_fd
);
218 /* Use the callback pointer as a private data ptr. */
219 io_set_callback(pd
->event_iocb
, (io_callback_t
)pd
);
221 ret
= io_submit(io_ctx
, 1, &pd
->event_iocb
);
227 DEBUG(10, ("aio_linux_read: requestid=%d read requested "
228 "of %llu bytes at offset %llu\n",
230 (unsigned long long)pd
->aiocb
->aio_nbytes
,
231 (unsigned long long)pd
->aiocb
->aio_offset
));
236 /************************************************************************
237 Initiate an asynchronous pwrite call.
238 ***********************************************************************/
/* VFS hook: submit an async pwrite — mirror image of aio_linux_read.
 * NOTE(review): error-return lines are missing from this extract. */
240 static int aio_linux_write(struct vfs_handle_struct
*handle
,
241 struct files_struct
*fsp
,
242 SMB_STRUCT_AIOCB
*aiocb
)
/* Recover smbd's aio_extra from the sigevent payload (talloc parent). */
244 struct aio_extra
*aio_ex
= (struct aio_extra
*)aiocb
->aio_sigevent
.sigev_value
.sival_ptr
;
245 struct aio_private_data
*pd
= NULL
;
248 if (!init_aio_linux(handle
)) {
252 pd
= create_private_data(aio_ex
, aiocb
);
254 DEBUG(10, ("aio_linux_write: Could not create private data.\n"));
258 io_prep_pwrite(pd
->event_iocb
,
259 pd
->aiocb
->aio_fildes
,
260 discard_const(pd
->aiocb
->aio_buf
),
261 pd
->aiocb
->aio_nbytes
,
262 pd
->aiocb
->aio_offset
);
/* Completion will be signalled through event_fd. */
263 io_set_eventfd(pd
->event_iocb
, event_fd
);
264 /* Use the callback pointer as a private data ptr. */
265 io_set_callback(pd
->event_iocb
, (io_callback_t
)pd
);
267 ret
= io_submit(io_ctx
, 1, &pd
->event_iocb
);
273 DEBUG(10, ("aio_linux_write: requestid=%d pwrite requested "
274 "of %llu bytes at offset %llu\n",
276 (unsigned long long)pd
->aiocb
->aio_nbytes
,
277 (unsigned long long)pd
->aiocb
->aio_offset
));
282 /************************************************************************
283 Handle a single finished io.
284 ***********************************************************************/
/* Records one completed kernel io_event into its aio_private_data and
 * notifies smbd. NOTE(review): the branch structure between res2/res
 * handling is missing lines in this extract; presumably cancelled
 * requests are skipped before smbd_aio_complete_aio_ex() — confirm
 * against upstream. */
286 static void aio_linux_handle_io_finished(struct io_event
*ioev
)
288 struct aio_extra
*aio_ex
= NULL
;
/* io_set_callback() stored pd in the iocb; the kernel echoes it in data. */
289 struct aio_private_data
*pd
= (struct aio_private_data
*)ioev
->data
;
291 /* ioev->res2 contains the -errno if error. */
292 /* ioev->res contains the number of bytes sent/received. */
295 pd
->ret_errno
= -ioev
->res2
;
297 pd
->ret_size
= ioev
->res
;
301 aio_ex
= (struct aio_extra
*)pd
->aiocb
->aio_sigevent
.sigev_value
.sival_ptr
;
302 smbd_aio_complete_aio_ex(aio_ex
);
304 DEBUG(10,("aio_linux_handle_io_finished: requestid %d completed\n",
309 /************************************************************************
310 Callback when multiple IOs complete.
311 ***********************************************************************/
/* tevent fd-readable callback on event_fd: read the eventfd counter (= number
 * of completed iocbs), then drain them in batches via io_getevents() and hand
 * each to aio_linux_handle_io_finished(). */
313 static void aio_linux_handle_completion(struct event_context
*event_ctx
,
314 struct fd_event
*event
,
318 uint64_t num_events
= 0;
320 DEBUG(10, ("aio_linux_handle_completion called with flags=%d\n",
/* Only act on readability; ignore spurious write/other flags. */
323 if ((flags
& EVENT_FD_READ
) == 0) {
327 /* Read the number of events available. */
/* An eventfd read always transfers exactly 8 bytes; anything else is fatal. */
328 if (sys_read(event_fd
, &num_events
, sizeof(num_events
)) !=
329 sizeof(num_events
)) {
330 smb_panic("aio_linux_handle_completion: invalid read");
/* Drain in chunks no larger than the io_recv_events array. */
333 while (num_events
> 0) {
334 uint64_t events_to_read
= MIN(num_events
, aio_pending_size
);
342 ret
= io_getevents(io_ctx
,
344 (long)events_to_read
,
350 DEBUG(1, ("aio_linux_handle_completion: "
351 "io_getevents error %s\n",
357 DEBUG(10, ("aio_linux_handle_completion: "
358 "io_getevents returned 0\n"));
362 /* ret is positive. */
363 for (i
= 0; i
< ret
; i
++) {
364 aio_linux_handle_io_finished(&io_recv_events
[i
]);
371 /************************************************************************
372 Find the private data by aiocb.
373 ***********************************************************************/
/* Linear scan of the outstanding-request list; returns the matching entry
 * (NULL-on-miss return is among the lines dropped from this extract). */
375 static struct aio_private_data
*find_private_data_by_aiocb(SMB_STRUCT_AIOCB
*aiocb
)
377 struct aio_private_data
*pd
;
379 for (pd
= pd_list
; pd
!= NULL
; pd
= pd
->next
) {
380 if (pd
->aiocb
== aiocb
) {
388 /************************************************************************
389 Called to return the result of a completed AIO.
390 Should only be called if aio_error returns something other than EINPROGRESS.
392 Any other value - return from IO operation.
393 ***********************************************************************/
/* VFS aio_return hook: look up the request; unknown aiocb => EINVAL.
 * On a failed IO (ret_size == -1) publish the saved errno; otherwise the
 * byte count is returned (return lines missing from this extract). */
395 static ssize_t
aio_linux_return_fn(struct vfs_handle_struct
*handle
,
396 struct files_struct
*fsp
,
397 SMB_STRUCT_AIOCB
*aiocb
)
399 struct aio_private_data
*pd
= find_private_data_by_aiocb(aiocb
);
403 DEBUG(0, ("aio_linux_return_fn: returning EINVAL\n"));
409 if (pd
->ret_size
== -1) {
410 errno
= pd
->ret_errno
;
416 /************************************************************************
417 Called to check the result of an AIO.
419 EINPROGRESS - still in progress.
420 EINVAL - invalid aiocb.
421 ECANCELED - request was cancelled.
422 0 - request completed successfully.
423 Any other value - errno from IO operation.
424 ***********************************************************************/
/* VFS aio_error hook: report the request's current status per the table
 * above. (EINVAL / ECANCELED branches are among the dropped lines.) */
426 static int aio_linux_error_fn(struct vfs_handle_struct
*handle
,
427 struct files_struct
*fsp
,
428 SMB_STRUCT_AIOCB
*aiocb
)
430 struct aio_private_data
*pd
= find_private_data_by_aiocb(aiocb
);
438 return pd
->ret_errno
;
441 /************************************************************************
442 Called to request the cancel of an AIO, or all of them on a specific
443 fsp if aiocb == NULL.
444 ***********************************************************************/
/* VFS aio_cancel hook. Walks pd_list, skipping entries that don't belong to
 * this fsp (or don't match the specific aiocb when one was given), and marks
 * the rest cancelled rather than issuing a kernel io_cancel — the result is
 * simply discarded when it arrives. */
446 static int aio_linux_cancel(struct vfs_handle_struct
*handle
,
447 struct files_struct
*fsp
,
448 SMB_STRUCT_AIOCB
*aiocb
)
450 struct aio_private_data
*pd
= NULL
;
452 for (pd
= pd_list
; pd
!= NULL
; pd
= pd
->next
) {
/* Defensive: entry with no aiocb can't be matched — skip. */
453 if (pd
->aiocb
== NULL
) {
/* Different file descriptor => different fsp — skip. */
456 if (pd
->aiocb
->aio_fildes
!= fsp
->fh
->fd
) {
/* Specific-aiocb cancel requested and this isn't it — skip. */
459 if ((aiocb
!= NULL
) && (pd
->aiocb
!= aiocb
)) {
464 * We let the kernel do its job, but we discard the result when
465 * it's finished. NB. Should I call io_cancel here ?
468 pd
->cancelled
= true;
474 /************************************************************************
475 Callback for a previously detected job completion deferred to the main
477 ***********************************************************************/
/* Immediate-event callback: private_data is a talloc'ed copy of the io_event
 * made by the suspend path; finish it now on the main event context. */
479 static void aio_linux_handle_immediate(struct tevent_context
*ctx
,
480 struct tevent_immediate
*im
,
483 struct io_event
*ioev
= (struct io_event
*)private_data
;
485 aio_linux_handle_io_finished(ioev
);
489 /************************************************************************
490 Private data struct used in suspend completion code.
491 ***********************************************************************/
/* NOTE(review): counters referenced later (num_entries, num_finished) are
 * among the lines dropped from this extract. */
493 struct suspend_private
{
/* The caller-supplied array of aiocbs we are waiting on. */
496 const SMB_STRUCT_AIOCB
* const *aiocb_array
;
499 /************************************************************************
500 Handle a single finished io from suspend.
501 ***********************************************************************/
/* During a blocking suspend, decide whether a completed io_event belongs to
 * the set we're waiting for. If yes, finish it inline (and presumably bump
 * sp->num_finished — that line is dropped from this extract). If no, copy
 * the event and reschedule it as an immediate on the main event context so
 * it isn't lost when the sub-event-loop ends. */
503 static void aio_linux_handle_suspend_io_finished(struct suspend_private
*sp
,
504 struct io_event
*ioev
)
506 struct aio_private_data
*pd
= (struct aio_private_data
*)ioev
->data
;
507 struct io_event
*new_ioev
= NULL
;
508 struct tevent_immediate
*im
= NULL
;
511 /* Is this a requestid with an aiocb we're interested in ? */
512 for (i
= 0; i
< sp
->num_entries
; i
++) {
513 if (sp
->aiocb_array
[i
] == pd
->aiocb
) {
515 aio_linux_handle_io_finished(ioev
);
520 /* Jobid completed we weren't waiting for.
521 We must reshedule this as an immediate event
522 on the main event context. */
523 im
= tevent_create_immediate(NULL
);
525 exit_server_cleanly("aio_linux_handle_suspend_completion: no memory");
/* ioev points into io_recv_events which will be reused; take a copy. */
528 new_ioev
= (struct io_event
*)talloc_memdup(NULL
,
530 sizeof(struct io_event
));
532 exit_server_cleanly("aio_linux_handle_suspend_completion: no memory");
535 DEBUG(10,("aio_linux_handle_suspend_completion: "
536 "re-scheduling requestid %d\n",
539 tevent_schedule_immediate(im
,
540 server_event_context(),
541 aio_linux_handle_immediate
,
545 /************************************************************************
546 Callback when an IO completes from a suspend call.
547 ***********************************************************************/
/* fd-readable callback used by the suspend sub-event-loop: same drain pattern
 * as aio_linux_handle_completion(), but batches are capped by the number of
 * still-awaited entries and each event is routed through the suspend filter. */
549 static void aio_linux_handle_suspend_completion(struct event_context
*event_ctx
,
550 struct fd_event
*event
,
554 struct suspend_private
*sp
= (struct suspend_private
*)p
;
555 uint64_t remaining_events
= sp
->num_entries
- sp
->num_finished
;
556 uint64_t num_events
= 0;
558 DEBUG(10, ("aio_linux_handle_suspend_completion called with flags=%d\n",
561 if ((flags
& EVENT_FD_READ
) == 0) {
565 /* Read the number of events available. */
566 if (sys_read(event_fd
, &num_events
, sizeof(num_events
)) !=
567 sizeof(num_events
)) {
/* NOTE(review): panic message names the wrong function — it says
 * "aio_linux_handle_completion" but we are in the suspend variant.
 * Looks like a copy/paste slip; worth fixing upstream. */
568 smb_panic("aio_linux_handle_completion: invalid read");
571 while (num_events
> 0) {
572 uint64_t events_to_read
= MIN(num_events
, remaining_events
);
580 ret
= io_getevents(io_ctx
,
582 (long)events_to_read
,
588 DEBUG(1, ("aio_linux_handle_suspend_completion: "
589 "io_getevents error %s\n",
595 DEBUG(10, ("aio_linux_handle_suspend_completion: "
596 "io_getevents returned 0\n"));
600 /* ret is positive. */
601 for (i
= 0; i
< ret
; i
++) {
602 aio_linux_handle_suspend_io_finished(sp
,
/* Timer callback for the suspend path: flip the caller's timed_out flag so
 * the waiting loop in aio_linux_suspend() can give up. */
610 static void aio_linux_suspend_timed_out(struct tevent_context
*event_ctx
,
611 struct tevent_timer
*te
,
615 bool *timed_out
= (bool *)private_data
;
616 /* Remove this timed event handler. */
621 /************************************************************************
622 Called to request everything to stop until all IO is completed.
623 ***********************************************************************/
/* VFS aio_suspend hook: block (via a private sub-event-loop) until every
 * aiocb in aiocb_array has completed, or until the optional timeout fires.
 * NOTE(review): several lines are dropped from this extract, including the
 * sp counter setup, error returns and the final TALLOC_FREE(frame). */
625 static int aio_linux_suspend(struct vfs_handle_struct
*handle,
626 struct files_struct
*fsp
,
627 const SMB_STRUCT_AIOCB
* const aiocb_array
[],
629 const struct timespec
*timeout
)
631 struct event_context
*ev
= NULL
;
632 struct fd_event
*sock_event
= NULL
;
634 struct suspend_private sp
;
635 bool timed_out
= false;
636 TALLOC_CTX
*frame
= talloc_stackframe();
638 /* This is a blocking call, and has to use a sub-event loop. */
639 ev
= event_context_init(frame
);
/* Optional timeout: arm a one-shot timer that sets timed_out. */
646 struct timeval tv
= convert_timespec_to_timeval(*timeout
);
647 struct tevent_timer
*te
= tevent_add_timer(ev
,
649 timeval_current_ofs(tv
.tv_sec
,
651 aio_linux_suspend_timed_out
,
661 sp
.aiocb_array
= aiocb_array
;
/* Watch event_fd on the private loop; completions go through the
 * suspend-aware handler so unrelated jobs get rescheduled, not lost. */
664 sock_event
= tevent_add_fd(ev
,
668 aio_linux_handle_suspend_completion
,
670 if (sock_event
== NULL
) {
674 * We're going to cheat here. We know that smbd/aio.c
675 * only calls this when it's waiting for every single
676 * outstanding call to finish on a close, so just wait
677 * individually for each IO to complete. We don't care
678 * what order they finish - only that they all do. JRA.
680 while (sp
.num_entries
!= sp
.num_finished
) {
681 if (tevent_loop_once(ev
) == -1) {
/* VFS connect hook: read the per-share "aio_linux:aio num events" parameter
 * (default 128) into the global aio_pending_size, then chain to the next
 * module in the VFS stack. (The 'user' parameter line is dropped from this
 * extract.) */
699 static int aio_linux_connect(vfs_handle_struct
*handle
, const char *service
,
702 /*********************************************************************
703 * How many io_events to initialize ?
704 * 128 per process seems insane as a default until you realize that
705 * (a) Throttling is done in SMB2 via the crediting algorithm.
706 * (b) SMB1 clients are limited to max_mux (50) outstanding
707 * requests and Windows clients don't use this anyway.
708 * Essentially we want this to be unlimited unless smb.conf
710 *********************************************************************/
711 aio_pending_size
= lp_parm_int(
712 SNUM(handle
->conn
), "aio_linux", "aio num events", 128);
713 return SMB_VFS_NEXT_CONNECT(handle
, service
, user
);
/* VFS operation table: only connect and the AIO entry points are overridden;
 * everything else falls through to the next module in the stack. */
716 static struct vfs_fn_pointers vfs_aio_linux_fns
= {
717 .connect_fn
= aio_linux_connect
,
718 .aio_read_fn
= aio_linux_read
,
719 .aio_write_fn
= aio_linux_write
,
720 .aio_return_fn
= aio_linux_return_fn
,
721 .aio_cancel_fn
= aio_linux_cancel
,
722 .aio_error_fn
= aio_linux_error_fn
,
723 .aio_suspend_fn
= aio_linux_suspend
,
/* Module entry point: register this VFS module under the name "aio_linux". */
726 NTSTATUS
vfs_aio_linux_init(void)
728 return smb_register_vfs(SMB_VFS_INTERFACE_VERSION
,
729 "aio_linux", &vfs_aio_linux_fns
);