/*
 * Simulate Posix AIO using Linux kernel AIO.
 *
 * Copyright (C) Jeremy Allison 2012
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include "includes.h"
#include "system/filesys.h"
#include "smbd/smbd.h"
#include "smbd/globals.h"
#include <sys/eventfd.h>
#include <libaio.h>
29 static int event_fd
= -1;
30 static io_context_t io_ctx
;
31 static int aio_linux_requestid
;
32 static struct io_event
*io_recv_events
;
33 static struct fd_event
*aio_read_event
;
35 struct aio_private_data
{
36 struct aio_private_data
*prev
, *next
;
38 SMB_STRUCT_AIOCB
*aiocb
;
39 struct iocb
*event_iocb
;
45 /* List of outstanding requests we have. */
46 static struct aio_private_data
*pd_list
;
/* Forward declaration: fd handler fired when event_fd becomes readable. */
static void aio_linux_handle_completion(struct event_context *event_ctx,
					struct fd_event *event,
					uint16_t flags,
					void *p);
53 /************************************************************************
54 Housekeeping. Cleanup if no activity for 30 seconds.
55 ***********************************************************************/
57 static void aio_linux_housekeeping(struct tevent_context
*event_ctx
,
58 struct tevent_timer
*te
,
62 /* Remove this timed event handler. */
65 if (pd_list
!= NULL
) {
66 /* Still busy. Look again in 30 seconds. */
67 (void)tevent_add_timer(event_ctx
,
69 timeval_current_ofs(30, 0),
70 aio_linux_housekeeping
,
75 /* No activity for 30 seconds. Close out kernel resources. */
76 io_queue_release(io_ctx
);
77 memset(&io_ctx
, '\0', sizeof(io_ctx
));
84 TALLOC_FREE(aio_read_event
);
85 TALLOC_FREE(io_recv_events
);
88 /************************************************************************
89 Ensure event fd and aio context are initialized.
90 ***********************************************************************/
92 static bool init_aio_linux(struct vfs_handle_struct
*handle
)
94 struct tevent_timer
*te
= NULL
;
97 /* Already initialized. */
101 /* Schedule a shutdown event for 30 seconds from now. */
102 te
= tevent_add_timer(server_event_context(),
104 timeval_current_ofs(30, 0),
105 aio_linux_housekeeping
,
112 /* Ensure we have enough space for aio_pending_size events. */
113 io_recv_events
= talloc_zero_array(NULL
,
116 if (io_recv_events
== NULL
) {
120 event_fd
= eventfd(0, EFD_NONBLOCK
| EFD_CLOEXEC
);
121 if (event_fd
== -1) {
125 aio_read_event
= tevent_add_fd(server_event_context(),
129 aio_linux_handle_completion
,
131 if (aio_read_event
== NULL
) {
135 if (io_queue_init(aio_pending_size
, &io_ctx
)) {
139 DEBUG(10,("init_aio_linux: initialized with up to %d events\n",
146 DEBUG(10,("init_aio_linux: initialization failed\n"));
149 TALLOC_FREE(io_recv_events
);
150 TALLOC_FREE(aio_read_event
);
151 if (event_fd
!= -1) {
155 memset(&io_ctx
, '\0', sizeof(io_ctx
));
159 /************************************************************************
160 Private data destructor.
161 ***********************************************************************/
163 static int pd_destructor(struct aio_private_data
*pd
)
165 DLIST_REMOVE(pd_list
, pd
);
169 /************************************************************************
170 Create and initialize a private data struct.
171 ***********************************************************************/
173 static struct aio_private_data
*create_private_data(TALLOC_CTX
*ctx
,
174 SMB_STRUCT_AIOCB
*aiocb
)
176 struct aio_private_data
*pd
= talloc_zero(ctx
, struct aio_private_data
);
180 pd
->event_iocb
= talloc_zero(pd
, struct iocb
);
181 pd
->requestid
= aio_linux_requestid
++;
184 pd
->ret_errno
= EINPROGRESS
;
185 talloc_set_destructor(pd
, pd_destructor
);
186 DLIST_ADD_END(pd_list
, pd
, struct aio_private_data
*);
190 /************************************************************************
191 Initiate an asynchronous pread call.
192 ***********************************************************************/
194 static int aio_linux_read(struct vfs_handle_struct
*handle
,
195 struct files_struct
*fsp
,
196 SMB_STRUCT_AIOCB
*aiocb
)
198 struct aio_extra
*aio_ex
= (struct aio_extra
*)aiocb
->aio_sigevent
.sigev_value
.sival_ptr
;
199 struct aio_private_data
*pd
= NULL
;
202 if (!init_aio_linux(handle
)) {
206 pd
= create_private_data(aio_ex
, aiocb
);
208 DEBUG(10, ("aio_linux_read: Could not create private data.\n"));
212 io_prep_pread(pd
->event_iocb
,
213 pd
->aiocb
->aio_fildes
,
214 discard_const(pd
->aiocb
->aio_buf
),
215 pd
->aiocb
->aio_nbytes
,
216 pd
->aiocb
->aio_offset
);
217 io_set_eventfd(pd
->event_iocb
, event_fd
);
218 /* Use the callback pointer as a private data ptr. */
219 io_set_callback(pd
->event_iocb
, (io_callback_t
)pd
);
221 ret
= io_submit(io_ctx
, 1, &pd
->event_iocb
);
227 DEBUG(10, ("aio_linux_read: requestid=%d read requested "
228 "of %llu bytes at offset %llu\n",
230 (unsigned long long)pd
->aiocb
->aio_nbytes
,
231 (unsigned long long)pd
->aiocb
->aio_offset
));
236 /************************************************************************
237 Initiate an asynchronous pwrite call.
238 ***********************************************************************/
240 static int aio_linux_write(struct vfs_handle_struct
*handle
,
241 struct files_struct
*fsp
,
242 SMB_STRUCT_AIOCB
*aiocb
)
244 struct aio_extra
*aio_ex
= (struct aio_extra
*)aiocb
->aio_sigevent
.sigev_value
.sival_ptr
;
245 struct aio_private_data
*pd
= NULL
;
248 if (!init_aio_linux(handle
)) {
252 pd
= create_private_data(aio_ex
, aiocb
);
254 DEBUG(10, ("aio_linux_write: Could not create private data.\n"));
258 io_prep_pwrite(pd
->event_iocb
,
259 pd
->aiocb
->aio_fildes
,
260 discard_const(pd
->aiocb
->aio_buf
),
261 pd
->aiocb
->aio_nbytes
,
262 pd
->aiocb
->aio_offset
);
263 io_set_eventfd(pd
->event_iocb
, event_fd
);
264 /* Use the callback pointer as a private data ptr. */
265 io_set_callback(pd
->event_iocb
, (io_callback_t
)pd
);
267 ret
= io_submit(io_ctx
, 1, &pd
->event_iocb
);
273 DEBUG(10, ("aio_linux_write: requestid=%d pwrite requested "
274 "of %llu bytes at offset %llu\n",
276 (unsigned long long)pd
->aiocb
->aio_nbytes
,
277 (unsigned long long)pd
->aiocb
->aio_offset
));
282 /************************************************************************
283 Save off the error / success conditions from the io_event.
284 Is idempotent (can be called multiple times given the same ioev).
285 ***********************************************************************/
287 static void aio_linux_setup_returns(struct io_event
*ioev
)
289 struct aio_private_data
*pd
= (struct aio_private_data
*)ioev
->data
;
293 pd
->ret_errno
= -ioev
->res
;
295 pd
->ret_size
= ioev
->res
;
300 /************************************************************************
301 Handle a single finished io.
302 ***********************************************************************/
304 static void aio_linux_handle_io_finished(struct io_event
*ioev
)
306 struct aio_extra
*aio_ex
= NULL
;
307 struct aio_private_data
*pd
= (struct aio_private_data
*)ioev
->data
;
309 aio_linux_setup_returns(ioev
);
311 aio_ex
= (struct aio_extra
*)pd
->aiocb
->aio_sigevent
.sigev_value
.sival_ptr
;
312 smbd_aio_complete_aio_ex(aio_ex
);
314 DEBUG(10,("aio_linux_handle_io_finished: requestid %d completed\n",
319 /************************************************************************
320 Callback when multiple IOs complete.
321 ***********************************************************************/
323 static void aio_linux_handle_completion(struct event_context
*event_ctx
,
324 struct fd_event
*event
,
328 uint64_t num_events
= 0;
330 DEBUG(10, ("aio_linux_handle_completion called with flags=%d\n",
333 if ((flags
& EVENT_FD_READ
) == 0) {
337 /* Read the number of events available. */
338 if (sys_read(event_fd
, &num_events
, sizeof(num_events
)) !=
339 sizeof(num_events
)) {
340 smb_panic("aio_linux_handle_completion: invalid read");
343 while (num_events
> 0) {
344 uint64_t events_to_read
= MIN(num_events
, aio_pending_size
);
352 ret
= io_getevents(io_ctx
,
354 (long)events_to_read
,
360 DEBUG(1, ("aio_linux_handle_completion: "
361 "io_getevents error %s\n",
367 DEBUG(10, ("aio_linux_handle_completion: "
368 "io_getevents returned 0\n"));
372 /* ret is positive. */
373 for (i
= 0; i
< ret
; i
++) {
374 aio_linux_handle_io_finished(&io_recv_events
[i
]);
381 /************************************************************************
382 Find the private data by aiocb.
383 ***********************************************************************/
385 static struct aio_private_data
*find_private_data_by_aiocb(SMB_STRUCT_AIOCB
*aiocb
)
387 struct aio_private_data
*pd
;
389 for (pd
= pd_list
; pd
!= NULL
; pd
= pd
->next
) {
390 if (pd
->aiocb
== aiocb
) {
398 /************************************************************************
399 Called to return the result of a completed AIO.
400 Should only be called if aio_error returns something other than EINPROGRESS.
402 Any other value - return from IO operation.
403 ***********************************************************************/
405 static ssize_t
aio_linux_return_fn(struct vfs_handle_struct
*handle
,
406 struct files_struct
*fsp
,
407 SMB_STRUCT_AIOCB
*aiocb
)
409 struct aio_private_data
*pd
= find_private_data_by_aiocb(aiocb
);
413 DEBUG(0, ("aio_linux_return_fn: returning EINVAL\n"));
424 if (pd
->ret_size
== -1) {
425 errno
= pd
->ret_errno
;
431 /************************************************************************
432 Called to check the result of an AIO.
434 EINPROGRESS - still in progress.
435 EINVAL - invalid aiocb.
436 ECANCELED - request was cancelled.
437 0 - request completed successfully.
438 Any other value - errno from IO operation.
439 ***********************************************************************/
441 static int aio_linux_error_fn(struct vfs_handle_struct
*handle
,
442 struct files_struct
*fsp
,
443 SMB_STRUCT_AIOCB
*aiocb
)
445 struct aio_private_data
*pd
= find_private_data_by_aiocb(aiocb
);
453 return pd
->ret_errno
;
456 /************************************************************************
457 Called to request the cancel of an AIO, or all of them on a specific
458 fsp if aiocb == NULL.
459 ***********************************************************************/
461 static int aio_linux_cancel(struct vfs_handle_struct
*handle
,
462 struct files_struct
*fsp
,
463 SMB_STRUCT_AIOCB
*aiocb
)
465 struct aio_private_data
*pd
= NULL
;
467 for (pd
= pd_list
; pd
!= NULL
; pd
= pd
->next
) {
468 if (pd
->aiocb
== NULL
) {
471 if (pd
->aiocb
->aio_fildes
!= fsp
->fh
->fd
) {
474 if ((aiocb
!= NULL
) && (pd
->aiocb
!= aiocb
)) {
479 * We let the kernel do its job, but we discard the result when
480 * it's finished. NB. Should I call io_cancel here ?
483 pd
->cancelled
= true;
/************************************************************************
 Callback for a previously detected job completion deferred to the main
 loop. private_data is a talloc_memdup'ed io_event we own and free here.
***********************************************************************/

static void aio_linux_handle_immediate(struct tevent_context *ctx,
				struct tevent_immediate *im,
				void *private_data)
{
	struct io_event *ioev = (struct io_event *)private_data;

	aio_linux_handle_io_finished(ioev);
	TALLOC_FREE(ioev);
}
504 /************************************************************************
505 Private data struct used in suspend completion code.
506 ***********************************************************************/
508 struct suspend_private
{
511 const SMB_STRUCT_AIOCB
* const *aiocb_array
;
514 /************************************************************************
515 Handle a single finished io from suspend.
516 ***********************************************************************/
518 static void aio_linux_handle_suspend_io_finished(struct suspend_private
*sp
,
519 struct io_event
*ioev
)
521 struct aio_private_data
*pd
= (struct aio_private_data
*)ioev
->data
;
522 struct io_event
*new_ioev
= NULL
;
523 struct tevent_immediate
*im
= NULL
;
526 /* Is this a requestid with an aiocb we're interested in ? */
527 for (i
= 0; i
< sp
->num_entries
; i
++) {
528 if (sp
->aiocb_array
[i
] == pd
->aiocb
) {
531 * We don't call aio_linux_handle_io_finished()
532 * here, but only the function that sets up the
533 * return values. This allows
534 * aio_linux_handle_io_finished() to be successfully
535 * called from smbd/aio.c:wait_for_aio_completion()
536 * once we return from here with all io's done.
538 aio_linux_setup_returns(ioev
);
543 /* Jobid completed we weren't waiting for.
544 We must reshedule this as an immediate event
545 on the main event context. */
546 im
= tevent_create_immediate(NULL
);
548 exit_server_cleanly("aio_linux_handle_suspend_completion: no memory");
551 new_ioev
= (struct io_event
*)talloc_memdup(NULL
,
553 sizeof(struct io_event
));
555 exit_server_cleanly("aio_linux_handle_suspend_completion: no memory");
558 DEBUG(10,("aio_linux_handle_suspend_completion: "
559 "re-scheduling requestid %d\n",
562 tevent_schedule_immediate(im
,
563 server_event_context(),
564 aio_linux_handle_immediate
,
568 /************************************************************************
569 Callback when an IO completes from a suspend call.
570 ***********************************************************************/
572 static void aio_linux_handle_suspend_completion(struct event_context
*event_ctx
,
573 struct fd_event
*event
,
577 struct suspend_private
*sp
= (struct suspend_private
*)p
;
578 uint64_t remaining_events
= sp
->num_entries
- sp
->num_finished
;
579 uint64_t num_events
= 0;
581 DEBUG(10, ("aio_linux_handle_suspend_completion called with flags=%d\n",
584 if ((flags
& EVENT_FD_READ
) == 0) {
588 /* Read the number of events available. */
589 if (sys_read(event_fd
, &num_events
, sizeof(num_events
)) !=
590 sizeof(num_events
)) {
591 smb_panic("aio_linux_handle_completion: invalid read");
594 while (num_events
> 0) {
595 uint64_t events_to_read
= MIN(num_events
, remaining_events
);
603 ret
= io_getevents(io_ctx
,
605 (long)events_to_read
,
611 DEBUG(1, ("aio_linux_handle_suspend_completion: "
612 "io_getevents error %s\n",
618 DEBUG(10, ("aio_linux_handle_suspend_completion: "
619 "io_getevents returned 0\n"));
623 /* ret is positive. */
624 for (i
= 0; i
< ret
; i
++) {
625 aio_linux_handle_suspend_io_finished(sp
,
633 static void aio_linux_suspend_timed_out(struct tevent_context
*event_ctx
,
634 struct tevent_timer
*te
,
638 bool *timed_out
= (bool *)private_data
;
639 /* Remove this timed event handler. */
644 /************************************************************************
645 Called to request everything to stop until all IO is completed.
646 ***********************************************************************/
648 static int aio_linux_suspend(struct vfs_handle_struct
*handle
,
649 struct files_struct
*fsp
,
650 const SMB_STRUCT_AIOCB
* const aiocb_array
[],
652 const struct timespec
*timeout
)
654 struct event_context
*ev
= NULL
;
655 struct fd_event
*sock_event
= NULL
;
657 struct suspend_private sp
;
658 bool timed_out
= false;
659 TALLOC_CTX
*frame
= talloc_stackframe();
661 /* This is a blocking call, and has to use a sub-event loop. */
662 ev
= event_context_init(frame
);
669 struct timeval tv
= convert_timespec_to_timeval(*timeout
);
670 struct tevent_timer
*te
= tevent_add_timer(ev
,
672 timeval_current_ofs(tv
.tv_sec
,
674 aio_linux_suspend_timed_out
,
684 sp
.aiocb_array
= aiocb_array
;
687 sock_event
= tevent_add_fd(ev
,
691 aio_linux_handle_suspend_completion
,
693 if (sock_event
== NULL
) {
697 * We're going to cheat here. We know that smbd/aio.c
698 * only calls this when it's waiting for every single
699 * outstanding call to finish on a close, so just wait
700 * individually for each IO to complete. We don't care
701 * what order they finish - only that they all do. JRA.
703 while (sp
.num_entries
!= sp
.num_finished
) {
704 if (tevent_loop_once(ev
) == -1) {
722 static int aio_linux_connect(vfs_handle_struct
*handle
, const char *service
,
725 /*********************************************************************
726 * How many io_events to initialize ?
727 * 128 per process seems insane as a default until you realize that
728 * (a) Throttling is done in SMB2 via the crediting algorithm.
729 * (b) SMB1 clients are limited to max_mux (50) outstanding
730 * requests and Windows clients don't use this anyway.
731 * Essentially we want this to be unlimited unless smb.conf
733 *********************************************************************/
734 aio_pending_size
= lp_parm_int(
735 SNUM(handle
->conn
), "aio_linux", "aio num events", 128);
736 return SMB_VFS_NEXT_CONNECT(handle
, service
, user
);
739 static struct vfs_fn_pointers vfs_aio_linux_fns
= {
740 .connect_fn
= aio_linux_connect
,
741 .aio_read_fn
= aio_linux_read
,
742 .aio_write_fn
= aio_linux_write
,
743 .aio_return_fn
= aio_linux_return_fn
,
744 .aio_cancel_fn
= aio_linux_cancel
,
745 .aio_error_fn
= aio_linux_error_fn
,
746 .aio_suspend_fn
= aio_linux_suspend
,
749 NTSTATUS
vfs_aio_linux_init(void)
751 return smb_register_vfs(SMB_VFS_INTERFACE_VERSION
,
752 "aio_linux", &vfs_aio_linux_fns
);