/*
 * Simulate Posix AIO using Linux kernel AIO.
 *
 * Copyright (C) Jeremy Allison 2012
 * Copyright (C) Volker Lendecke 2012
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include "includes.h"
#include "system/filesys.h"
#include "smbd/smbd.h"
#include "smbd/globals.h"
#include "lib/util/tevent_unix.h"
#include <sys/eventfd.h>
#include <libaio.h>
30 static int event_fd
= -1;
31 static io_context_t io_ctx
;
32 static struct fd_event
*aio_read_event
;
34 static unsigned num_busy
;
36 static void aio_linux_done(struct event_context
*event_ctx
,
37 struct fd_event
*event
,
38 uint16 flags
, void *private_data
);
40 /************************************************************************
41 Housekeeping. Cleanup if no activity for 30 seconds.
42 ***********************************************************************/
44 static void aio_linux_housekeeping(struct tevent_context
*event_ctx
,
45 struct tevent_timer
*te
,
49 /* Remove this timed event handler. */
52 if ((num_busy
!= 0) || used
) {
55 /* Still busy. Look again in 30 seconds. */
56 (void)tevent_add_timer(event_ctx
,
58 timeval_current_ofs(30, 0),
59 aio_linux_housekeeping
,
64 /* No activity for 30 seconds. Close out kernel resources. */
65 io_queue_release(io_ctx
);
66 memset(&io_ctx
, '\0', sizeof(io_ctx
));
73 TALLOC_FREE(aio_read_event
);
76 /************************************************************************
77 Ensure event fd and aio context are initialized.
78 ***********************************************************************/
80 static bool init_aio_linux(struct vfs_handle_struct
*handle
)
82 struct tevent_timer
*te
= NULL
;
85 /* Already initialized. */
89 /* Schedule a shutdown event for 30 seconds from now. */
90 te
= tevent_add_timer(handle
->conn
->sconn
->ev_ctx
,
92 timeval_current_ofs(30, 0),
93 aio_linux_housekeeping
,
100 event_fd
= eventfd(0, EFD_NONBLOCK
| EFD_CLOEXEC
);
101 if (event_fd
== -1) {
105 aio_read_event
= tevent_add_fd(server_event_context(),
111 if (aio_read_event
== NULL
) {
115 if (io_queue_init(aio_pending_size
, &io_ctx
)) {
119 DEBUG(10,("init_aio_linux: initialized with up to %d events\n",
126 DEBUG(10,("init_aio_linux: initialization failed\n"));
129 TALLOC_FREE(aio_read_event
);
130 if (event_fd
!= -1) {
134 memset(&io_ctx
, '\0', sizeof(io_ctx
));
138 struct aio_linux_state
{
139 struct iocb event_iocb
;
144 static struct tevent_req
*aio_linux_pread_send(
145 struct vfs_handle_struct
*handle
, TALLOC_CTX
*mem_ctx
,
146 struct tevent_context
*ev
, struct files_struct
*fsp
,
147 void *data
, size_t n
, off_t offset
)
149 struct tevent_req
*req
;
150 struct aio_linux_state
*state
;
154 req
= tevent_req_create(mem_ctx
, &state
, struct aio_linux_state
);
158 if (!init_aio_linux(handle
)) {
159 tevent_req_error(req
, EIO
);
160 return tevent_req_post(req
, ev
);
163 io_prep_pread(&state
->event_iocb
, fsp
->fh
->fd
, data
, n
, offset
);
164 io_set_eventfd(&state
->event_iocb
, event_fd
);
165 state
->event_iocb
.data
= req
;
167 piocb
= &state
->event_iocb
;
169 ret
= io_submit(io_ctx
, 1, &piocb
);
171 tevent_req_error(req
, -ret
);
172 return tevent_req_post(req
, ev
);
179 static struct tevent_req
*aio_linux_pwrite_send(
180 struct vfs_handle_struct
*handle
, TALLOC_CTX
*mem_ctx
,
181 struct tevent_context
*ev
, struct files_struct
*fsp
,
182 const void *data
, size_t n
, off_t offset
)
184 struct tevent_req
*req
;
185 struct aio_linux_state
*state
;
189 req
= tevent_req_create(mem_ctx
, &state
, struct aio_linux_state
);
193 if (!init_aio_linux(handle
)) {
194 tevent_req_error(req
, EIO
);
195 return tevent_req_post(req
, ev
);
198 io_prep_pwrite(&state
->event_iocb
, fsp
->fh
->fd
, discard_const(data
),
200 io_set_eventfd(&state
->event_iocb
, event_fd
);
201 state
->event_iocb
.data
= req
;
203 piocb
= &state
->event_iocb
;
205 ret
= io_submit(io_ctx
, 1, &piocb
);
207 tevent_req_error(req
, -ret
);
208 return tevent_req_post(req
, ev
);
215 static struct tevent_req
*aio_linux_fsync_send(
216 struct vfs_handle_struct
*handle
, TALLOC_CTX
*mem_ctx
,
217 struct tevent_context
*ev
, struct files_struct
*fsp
)
219 struct tevent_req
*req
;
220 struct aio_linux_state
*state
;
224 req
= tevent_req_create(mem_ctx
, &state
, struct aio_linux_state
);
228 if (!init_aio_linux(handle
)) {
229 tevent_req_error(req
, EIO
);
230 return tevent_req_post(req
, ev
);
233 io_prep_fsync(&state
->event_iocb
, fsp
->fh
->fd
);
234 io_set_eventfd(&state
->event_iocb
, event_fd
);
235 state
->event_iocb
.data
= req
;
237 piocb
= &state
->event_iocb
;
239 ret
= io_submit(io_ctx
, 1, &piocb
);
241 tevent_req_error(req
, -ret
);
242 return tevent_req_post(req
, ev
);
249 static void aio_linux_done(struct event_context
*event_ctx
,
250 struct fd_event
*event
,
251 uint16 flags
, void *private_data
)
253 uint64_t num_events
= 0;
255 DEBUG(10, ("aio_linux_done called with flags=%d\n",
258 /* Read the number of events available. */
259 if (sys_read(event_fd
, &num_events
, sizeof(num_events
)) !=
260 sizeof(num_events
)) {
261 smb_panic("aio_linux_handle_completion: invalid read");
264 while (num_events
> 0) {
265 struct timespec ts
= { 0, };
266 struct io_event finished
;
267 struct tevent_req
*req
;
268 struct aio_linux_state
*state
;
271 ret
= io_getevents(io_ctx
, 1, 1, &finished
, &ts
);
273 DEBUG(1, ("aio_linux_done: io_getevents returned %s\n",
278 DEBUG(10, ("aio_linux_done: io_getvents returned "
285 req
= talloc_get_type_abort(finished
.data
,
287 state
= tevent_req_data(req
, struct aio_linux_state
);
289 if (finished
.res
< 0) {
291 state
->err
= -finished
.res
;
293 state
->ret
= finished
.res
;
296 tevent_req_done(req
);
301 static ssize_t
aio_linux_recv(struct tevent_req
*req
, int *err
)
303 struct aio_linux_state
*state
= tevent_req_data(
304 req
, struct aio_linux_state
);
306 if (tevent_req_is_unix_error(req
, err
)) {
309 if (state
->ret
== -1) {
/* fsync variant of the recv function: same semantics as aio_linux_recv
 * but with an int return to match the fsync_recv_fn signature. */
static int aio_linux_int_recv(struct tevent_req *req, int *err)
{
	/*
	 * Use implicit conversion ssize_t->int
	 */
	return aio_linux_recv(req, err);
}
323 static int aio_linux_connect(vfs_handle_struct
*handle
, const char *service
,
326 /*********************************************************************
327 * How many io_events to initialize ?
328 * 128 per process seems insane as a default until you realize that
329 * (a) Throttling is done in SMB2 via the crediting algorithm.
330 * (b) SMB1 clients are limited to max_mux (50) outstanding
331 * requests and Windows clients don't use this anyway.
332 * Essentially we want this to be unlimited unless smb.conf
334 *********************************************************************/
335 aio_pending_size
= lp_parm_int(
336 SNUM(handle
->conn
), "aio_linux", "aio num events", 128);
337 return SMB_VFS_NEXT_CONNECT(handle
, service
, user
);
340 static struct vfs_fn_pointers vfs_aio_linux_fns
= {
341 .connect_fn
= aio_linux_connect
,
342 .pread_send_fn
= aio_linux_pread_send
,
343 .pread_recv_fn
= aio_linux_recv
,
344 .pwrite_send_fn
= aio_linux_pwrite_send
,
345 .pwrite_recv_fn
= aio_linux_recv
,
346 .fsync_send_fn
= aio_linux_fsync_send
,
347 .fsync_recv_fn
= aio_linux_int_recv
,
350 NTSTATUS
vfs_aio_linux_init(void)
352 return smb_register_vfs(SMB_VFS_INTERFACE_VERSION
,
353 "aio_linux", &vfs_aio_linux_fns
);