2 * Unix SMB/CIFS implementation.
3 * Support for OneFS kernel oplocks
5 * Copyright (C) Volker Lendecke 2007
6 * Copyright (C) Tim Prouty, 2009
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 3 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #define DBGC_CLASS DBGC_LOCKING
27 #include "oplock_onefs.h"
28 #include "smbd/smbd.h"
29 #include "smbd/globals.h"
31 #include <ifs/ifs_syscalls.h>
32 #include <isi_ecs/isi_ecs_oplocks.h>
35 struct onefs_oplocks_context
{
36 struct kernel_oplocks
*ctx
;
37 const struct oplocks_event_ops
*onefs_ops
;
39 struct fd_event
*read_fde
;
42 enum onefs_callback_state
{
44 ONEFS_WAITING_FOR_OPLOCK
47 struct onefs_callback_record
{
48 struct onefs_callback_record
*prev
, *next
;
50 enum onefs_callback_state state
;
52 files_struct
*fsp
; /* ONEFS_OPEN_FILE */
53 uint64_t mid
; /* ONEFS_WAITING_FOR_OPLOCK */
58 * Internal list of files (along with additional state) that have outstanding
59 * oplocks or requests for oplocks.
61 struct onefs_callback_record
*callback_recs
;
64 * Convert a onefs_callback_record to a debug string using the dbg_ctx().
66 const char *onefs_cb_record_str_dbg(const struct onefs_callback_record
*r
)
71 result
= talloc_strdup(talloc_tos(), "NULL callback record");
77 result
= talloc_asprintf(talloc_tos(), "cb record %llu for "
79 fsp_str_dbg(r
->data
.fsp
));
80 case ONEFS_WAITING_FOR_OPLOCK
:
81 result
= talloc_asprintf(talloc_tos(), "cb record %llu for "
82 "pending mid %llu", r
->id
,
83 (unsigned long long)r
->data
.mid
);
86 result
= talloc_asprintf(talloc_tos(), "cb record %llu unknown "
87 "state %d", r
->id
, r
->state
);
95 * Traverse the list of onefs_callback_records and print all entries.
97 static void debug_cb_records(const char *fn
)
99 struct onefs_callback_record
*rec
;
104 DEBUG(10, ("cb records (%s):\n", fn
));
106 for (rec
= callback_recs
; rec
; rec
= rec
->next
) {
107 DEBUGADD(10, ("%s\n", onefs_cb_record_str_dbg(rec
)));
112 * Find a callback record in the list of outstanding oplock operations.
114 * Once ifs_createfile requests an oplock on a file, the kernel communicates
115 * with samba via the oplock event channel by sending events that reference an
116 * id. This function maps that id to the onefs_callback_record that was
117 * created for it during the initial setup on open (onefs_oplock_wait_record).
118 * When a matching id is found in the onefs_callback_record list, the
119 * callback_type is checked to make sure the record is in the correct
122 static struct onefs_callback_record
*onefs_find_cb(uint64_t id
,
123 enum onefs_callback_state expected_state
)
125 struct onefs_callback_record
*rec
;
127 debug_cb_records("onefs_find_cb");
129 for (rec
= callback_recs
; rec
; rec
= rec
->next
) {
131 DEBUG(10, ("found %s\n",
132 onefs_cb_record_str_dbg(rec
)));
138 DEBUG(5, ("Could not find callback record for id %llu\n", id
));
142 if (rec
->state
!= expected_state
) {
143 DEBUG(0, ("Expected cb type %d, got %s", expected_state
,
144 onefs_cb_record_str_dbg(rec
)));
153 * Remove and free a callback record from the callback record list.
155 void destroy_onefs_callback_record(uint64_t id
)
157 struct onefs_callback_record
*rec
;
159 debug_cb_records("destroy_onefs_callback_record");
162 DEBUG(10, ("destroy_onefs_callback_record: Nothing to "
167 for (rec
= callback_recs
; rec
; rec
= rec
->next
) {
169 DLIST_REMOVE(callback_recs
, rec
);
171 DEBUG(10, ("removed cb rec %llu\n", id
));
176 DEBUG(0, ("Could not find cb rec %llu to delete", id
));
181 * Initialize a callback record and add it to the list of outstanding callback
184 * This is called in the open path before ifs_createfile so an id can be
185 * passed in. Each callback record can be in one of two states:
187 * 1. WAITING_FOR_OPLOCK: This is the initial state for all callback
188 * records. If ifs_createfile can be completed synchronously without needing
189 * to break any level I oplocks, the state is transitioned to OPEN_FILE.
190 * Otherwise ifs_createfile will finish asynchronously and the open is
191 * deferred. When the necessary level I oplocks have been broken, and the
192 * open can be done, an event is sent by the kernel on the oplock event
193 * channel, which is handled by semlock_available_handler. At this point
194 * the deferred open is retried. Unless a level I oplock was acquired by
195 * another client, ifs_createfile will now complete synchronously.
197 * 2. OPEN_FILE: Once ifs_createfile completes, the callback record is
198 * transitioned to this state via onefs_set_oplock_callback.
200 uint64_t onefs_oplock_wait_record(uint64_t mid
)
202 struct onefs_callback_record
*result
;
203 static uint64_t id_generator
= 0;
205 if (!(result
= SMB_MALLOC_P(struct onefs_callback_record
))) {
206 DEBUG(0, ("talloc failed\n"));
210 memset(result
, '\0', sizeof(result
));
213 if (id_generator
== 0) {
214 /* Wow, that's a long-running smbd... */
218 result
->id
= id_generator
;
220 result
->state
= ONEFS_WAITING_FOR_OPLOCK
;
221 result
->data
.mid
= mid
;
222 DLIST_ADD(callback_recs
, result
);
224 DEBUG(10, ("New cb rec %llu created\n", result
->id
));
230 * Transition the callback record state to OPEN_FILE.
232 * This is called after the file is opened and an fsp struct has been
233 * allocated. The mid is dropped in favor of storing the fsp.
235 void onefs_set_oplock_callback(uint64_t id
, files_struct
*fsp
)
237 struct onefs_callback_record
*cb
;
240 DEBUG(10, ("onefs_set_oplock_callback called for cb rec %llu\n", id
));
242 if (!(cb
= onefs_find_cb(id
, ONEFS_WAITING_FOR_OPLOCK
))) {
243 if (asprintf(&msg
, "Got invalid callback %lld\n", id
) != -1) {
246 smb_panic("Got invalid callback id\n");
252 if (open_was_deferred(cb
->data
.mid
)) {
253 if (asprintf(&msg
, "Trying to upgrade callback for deferred "
254 "open mid=%llu\n", (unsigned long long)cb
->data
.mid
) != -1) {
257 smb_panic("Trying to upgrade callback for deferred open "
261 cb
->state
= ONEFS_OPEN_FILE
;
266 * Using a callback record, initialize a share mode entry to pass to
267 * share_mode_entry_to_message to send samba IPC messages.
269 static void init_share_mode_entry(struct share_mode_entry
*sme
,
270 struct onefs_callback_record
*cb
,
275 sme
->pid
= procid_self();
276 sme
->op_type
= op_type
;
277 sme
->id
= cb
->data
.fsp
->file_id
;
278 sme
->share_file_id
= cb
->data
.fsp
->fh
->gen_id
;
282 * Callback when a break-to-none event is received from the kernel.
284 * On OneFS level 1 oplocks are always broken to level 2 first, therefore an
285 * async level 2 break message is always sent when breaking to none. The
286 * downside of this is that OneFS currently has no way to express breaking
287 * directly from level 1 to none.
289 static void oplock_break_to_none_handler(uint64_t id
)
291 struct onefs_callback_record
*cb
;
292 struct share_mode_entry sme
;
293 char msg
[MSG_SMB_SHARE_MODE_ENTRY_SIZE
];
295 DEBUG(10, ("oplock_break_to_none_handler called for id %llu\n", id
));
297 if (!(cb
= onefs_find_cb(id
, ONEFS_OPEN_FILE
))) {
298 DEBUG(3, ("oplock_break_to_none_handler: could not find "
299 "callback id %llu\n", id
));
303 DEBUG(10, ("oplock_break_to_none_handler called for file %s\n",
304 fsp_str_dbg(cb
->data
.fsp
)));
306 init_share_mode_entry(&sme
, cb
, FORCE_OPLOCK_BREAK_TO_NONE
);
307 share_mode_entry_to_message(msg
, &sme
);
308 messaging_send_buf(smbd_messaging_context(),
310 MSG_SMB_ASYNC_LEVEL2_BREAK
,
312 MSG_SMB_SHARE_MODE_ENTRY_SIZE
);
315 * We could still receive an OPLOCK_REVOKED message, so keep the
316 * oplock_callback_id around.
321 * Callback when a break-to-level2 event is received from the kernel.
323 * Breaks from level 1 to level 2.
325 static void oplock_break_to_level_two_handler(uint64_t id
)
327 struct onefs_callback_record
*cb
;
328 struct share_mode_entry sme
;
329 char msg
[MSG_SMB_SHARE_MODE_ENTRY_SIZE
];
331 DEBUG(10, ("oplock_break_to_level_two_handler called for id %llu\n",
334 if (!(cb
= onefs_find_cb(id
, ONEFS_OPEN_FILE
))) {
335 DEBUG(3, ("oplock_break_to_level_two_handler: could not find "
336 "callback id %llu\n", id
));
340 DEBUG(10, ("oplock_break_to_level_two_handler called for file %s\n",
341 fsp_str_dbg(cb
->data
.fsp
)));
343 init_share_mode_entry(&sme
, cb
, LEVEL_II_OPLOCK
);
344 share_mode_entry_to_message(msg
, &sme
);
345 messaging_send_buf(smbd_messaging_context(),
347 MSG_SMB_BREAK_REQUEST
,
349 MSG_SMB_SHARE_MODE_ENTRY_SIZE
);
352 * We could still receive an OPLOCK_REVOKED or OPLOCK_BREAK_TO_NONE
353 * message, so keep the oplock_callback_id around.
358 * Revoke an oplock from an unresponsive client.
360 * The kernel will send this message when it times out waiting for a level 1
361 * oplock break to be acknowledged by the client. The oplock is then
362 * immediately removed.
364 static void oplock_revoked_handler(uint64_t id
)
366 struct onefs_callback_record
*cb
;
367 files_struct
*fsp
= NULL
;
369 DEBUG(10, ("oplock_revoked_handler called for id %llu\n", id
));
371 if (!(cb
= onefs_find_cb(id
, ONEFS_OPEN_FILE
))) {
372 DEBUG(3, ("oplock_revoked_handler: could not find "
373 "callback id %llu\n", id
));
379 SMB_ASSERT(fsp
->oplock_timeout
== NULL
);
381 DEBUG(0,("Level 1 oplock break failed for file %s. Forcefully "
382 "revoking oplock\n", fsp_str_dbg(fsp
)));
387 * cb record is cleaned up in fsp ext data destructor on close, so
388 * leave it in the list.
393 * Asynchronous ifs_createfile callback
395 * If ifs_createfile had to asynchronously break any oplocks, this function is
396 * called when the kernel sends an event that the open can be retried.
398 static void semlock_available_handler(uint64_t id
)
400 struct onefs_callback_record
*cb
;
402 DEBUG(10, ("semlock_available_handler called: %llu\n", id
));
404 if (!(cb
= onefs_find_cb(id
, ONEFS_WAITING_FOR_OPLOCK
))) {
405 DEBUG(5, ("semlock_available_handler: Did not find callback "
410 DEBUG(10, ("Got semlock available for mid %llu\n",
411 (unsigned long long)cb
->data
.mid
));
414 if (!(open_was_deferred(cb
->data
.mid
))) {
416 if (asprintf(&msg
, "Semlock available on an open that wasn't "
418 onefs_cb_record_str_dbg(cb
)) != -1) {
421 smb_panic("Semlock available on an open that wasn't "
425 schedule_deferred_open_smb_message(cb
->data
.mid
);
427 /* Cleanup the callback record since the open will be retried. */
428 destroy_onefs_callback_record(id
);
434 * Asynchronous ifs_createfile failure callback
436 * If ifs_createfile had to asynchronously break any oplocks, but an error was
437 * encountered in the kernel, the open will be retried with the state->failed
438 * set to true. This will prompt the open path to send an INTERNAL_ERROR
439 * error message to the client.
441 static void semlock_async_failure_handler(uint64_t id
)
443 struct onefs_callback_record
*cb
;
444 struct deferred_open_record
*state
;
446 DEBUG(1, ("semlock_async_failure_handler called: %llu\n", id
));
448 if (!(cb
= onefs_find_cb(id
, ONEFS_WAITING_FOR_OPLOCK
))) {
449 DEBUG(5, ("semlock_async_failure_handler: Did not find callback "
454 DEBUG(1, ("Got semlock_async_failure message for mid %llu\n",
455 (unsigned long long)cb
->data
.mid
));
458 if (!(open_was_deferred(cb
->data
.mid
))) {
460 if (asprintf(&msg
, "Semlock failure on an open that wasn't "
462 onefs_cb_record_str_dbg(cb
)) != -1) {
465 smb_panic("Semlock failure on an open that wasn't deferred\n");
468 /* Find the actual deferred open record. */
469 if (!get_open_deferred_message_state(cb
->data
.mid
, NULL
, &state
)) {
470 DEBUG(0, ("Could not find deferred request for "
471 "mid %d\n", cb
->data
.mid
));
472 destroy_onefs_callback_record(id
);
476 /* Update to failed so the client can be notified on retried open. */
477 state
->failed
= true;
479 /* Schedule deferred open for immediate retry. */
480 schedule_deferred_open_smb_message(cb
->data
.mid
);
482 /* Cleanup the callback record here since the open will be retried. */
483 destroy_onefs_callback_record(id
);
489 * OneFS acquires all oplocks via ifs_createfile, so this is a no-op.
491 static bool onefs_set_kernel_oplock(struct kernel_oplocks
*_ctx
,
492 files_struct
*fsp
, int oplock_type
) {
497 * Release the kernel oplock.
499 static void onefs_release_kernel_oplock(struct kernel_oplocks
*_ctx
,
500 files_struct
*fsp
, int oplock_type
)
502 enum oplock_type oplock
= onefs_samba_oplock_to_oplock(oplock_type
);
504 DEBUG(10, ("onefs_release_kernel_oplock: Releasing %s to type %s\n",
505 fsp_str_dbg(fsp
), onefs_oplock_str(oplock
)));
507 if (fsp
->fh
->fd
== -1) {
508 DEBUG(1, ("no fd\n"));
512 /* Downgrade oplock to either SHARED or NONE. */
513 if (ifs_oplock_downgrade(fsp
->fh
->fd
, oplock
)) {
514 DEBUG(1,("ifs_oplock_downgrade failed: %s\n",
520 * Wrap ifs_semlock_write so it is only called on operations that aren't
521 * already contended in the kernel.
523 static void onefs_semlock_write(int fd
, enum level2_contention_type type
,
524 enum semlock_operation semlock_op
)
529 case LEVEL2_CONTEND_ALLOC_GROW
:
530 case LEVEL2_CONTEND_POSIX_BRL
:
531 DEBUG(10, ("Taking %d write semlock for cmd %d on fd: %d\n",
532 semlock_op
, type
, fd
));
533 ret
= ifs_semlock_write(fd
, semlock_op
);
535 DEBUG(0,("ifs_semlock_write failed taking %d write "
536 "semlock for cmd %d on fd: %d: %s",
537 semlock_op
, type
, fd
, strerror(errno
)));
541 DEBUG(10, ("Skipping write semlock for cmd %d on fd: %d\n",
547 * Contend level 2 oplocks in the kernel and smbd.
549 * Taking a write semlock will contend all level 2 oplocks in all smbds across
550 * the cluster except the fsp's own level 2 oplock. This lack of
551 * self-contention is a limitation of the current OneFS kernel oplocks
552 * implementation. Luckily it is easy to contend our own level 2 oplock by
553 * checking the fsp's oplock_type. If it's a level2, send a break message
554 * to the client and remove the oplock.
556 static void onefs_contend_level2_oplocks_begin(files_struct
*fsp
,
557 enum level2_contention_type type
)
559 /* Take care of level 2 kernel contention. */
560 onefs_semlock_write(fsp
->fh
->fd
, type
, SEMLOCK_LOCK
);
562 /* Take care of level 2 self contention. */
563 if (LEVEL_II_OPLOCK_TYPE(fsp
->oplock_type
))
564 break_level2_to_none_async(fsp
);
568 * Unlock the write semlock when the level 2 contending operation ends.
570 static void onefs_contend_level2_oplocks_end(files_struct
*fsp
,
571 enum level2_contention_type type
)
573 /* Take care of level 2 kernel contention. */
574 onefs_semlock_write(fsp
->fh
->fd
, type
, SEMLOCK_UNLOCK
);
578 * Return string value of onefs oplock types.
580 const char *onefs_oplock_str(enum oplock_type onefs_oplock_type
)
582 switch (onefs_oplock_type
) {
584 return "OPLOCK_NONE";
585 case OPLOCK_EXCLUSIVE
:
586 return "OPLOCK_EXCLUSIVE";
588 return "OPLOCK_BATCH";
590 return "OPLOCK_SHARED";
598 * Convert from onefs to samba oplock.
600 int onefs_oplock_to_samba_oplock(enum oplock_type onefs_oplock
)
602 switch (onefs_oplock
) {
605 case OPLOCK_EXCLUSIVE
:
606 return EXCLUSIVE_OPLOCK
;
610 return LEVEL_II_OPLOCK
;
612 DEBUG(0, ("unknown oplock type %d found\n", onefs_oplock
));
619 * Convert from samba to onefs oplock.
621 enum oplock_type
onefs_samba_oplock_to_oplock(int samba_oplock_type
)
623 if (BATCH_OPLOCK_TYPE(samba_oplock_type
)) return OPLOCK_BATCH
;
624 if (EXCLUSIVE_OPLOCK_TYPE(samba_oplock_type
)) return OPLOCK_EXCLUSIVE
;
625 if (LEVEL_II_OPLOCK_TYPE(samba_oplock_type
)) return OPLOCK_SHARED
;
630 * Oplock event handler.
632 * Call into the event system dispatcher to handle each event.
634 static void onefs_oplocks_read_fde_handler(struct event_context
*ev
,
635 struct fd_event
*fde
,
639 struct onefs_oplocks_context
*ctx
=
640 talloc_get_type(private_data
, struct onefs_oplocks_context
);
642 if (oplocks_event_dispatcher(ctx
->onefs_ops
)) {
643 DEBUG(0, ("oplocks_event_dispatcher failed: %s\n",
649 * Setup kernel oplocks
651 static const struct kernel_oplocks_ops onefs_koplocks_ops
= {
652 .set_oplock
= onefs_set_kernel_oplock
,
653 .release_oplock
= onefs_release_kernel_oplock
,
654 .contend_level2_oplocks_begin
= onefs_contend_level2_oplocks_begin
,
655 .contend_level2_oplocks_end
= onefs_contend_level2_oplocks_end
,
658 static const struct oplocks_event_ops onefs_dispatch_ops
= {
659 .oplock_break_to_none
= oplock_break_to_none_handler
,
660 .oplock_break_to_level_two
= oplock_break_to_level_two_handler
,
661 .oplock_revoked
= oplock_revoked_handler
,
662 .semlock_available
= semlock_available_handler
,
663 .semlock_async_failure
= semlock_async_failure_handler
,
666 struct kernel_oplocks
*onefs_init_kernel_oplocks(TALLOC_CTX
*mem_ctx
)
668 struct kernel_oplocks
*_ctx
= NULL
;
669 struct onefs_oplocks_context
*ctx
= NULL
;
670 struct procoptions po
= PROCOPTIONS_INIT
;
672 DEBUG(10, ("onefs_init_kernel_oplocks called\n"));
674 /* Set the non-blocking proc flag */
675 po
.po_flags_on
|= P_NON_BLOCKING_SEMLOCK
;
676 if (setprocoptions(&po
) != 0) {
677 DEBUG(0, ("setprocoptions failed: %s.\n", strerror(errno
)));
681 /* Setup the oplock contexts */
682 _ctx
= talloc_zero(mem_ctx
, struct kernel_oplocks
);
687 ctx
= talloc_zero(_ctx
, struct onefs_oplocks_context
);
692 _ctx
->ops
= &onefs_koplocks_ops
;
693 _ctx
->flags
= (KOPLOCKS_LEVEL2_SUPPORTED
|
694 KOPLOCKS_DEFERRED_OPEN_NOTIFICATION
|
695 KOPLOCKS_TIMEOUT_NOTIFICATION
|
696 KOPLOCKS_OPLOCK_BROKEN_NOTIFICATION
);
697 _ctx
->private_data
= ctx
;
699 ctx
->onefs_ops
= &onefs_dispatch_ops
;
701 /* Register a kernel event channel for oplocks */
702 ctx
->onefs_event_fd
= oplocks_event_register();
703 if (ctx
->onefs_event_fd
== -1) {
704 DEBUG(0, ("oplocks_event_register failed: %s\n",
709 DEBUG(10, ("oplock event_fd = %d\n", ctx
->onefs_event_fd
));
711 /* Register the oplock event_fd with samba's event system */
712 ctx
->read_fde
= event_add_fd(server_event_context(),
716 onefs_oplocks_read_fde_handler
,
726 void oplock_onefs_dummy(void);
727 void oplock_onefs_dummy(void) {}
728 #endif /* HAVE_ONEFS */