libcli/smb: move smb2cli_ioctl.c from source3 to the toplevel
[Samba/vl.git] / source3 / smbd / oplock_onefs.c
blob615747e92cb5c640b34ce556c58205c4a27f6644
1 /*
2 * Unix SMB/CIFS implementation.
3 * Support for OneFS kernel oplocks
5 * Copyright (C) Volker Lendecke 2007
6 * Copyright (C) Tim Prouty, 2009
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 3 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #define DBGC_CLASS DBGC_LOCKING
24 #include "includes.h"
26 #if HAVE_ONEFS
27 #include "oplock_onefs.h"
28 #include "smbd/smbd.h"
29 #include "smbd/globals.h"
31 #include <ifs/ifs_syscalls.h>
32 #include <isi_ecs/isi_ecs_oplocks.h>
33 #include <sys/proc.h>
/**
 * State private to the OneFS kernel oplock backend.
 */
struct onefs_oplocks_context {
	/* Generic kernel oplocks context this state is attached to. */
	struct kernel_oplocks *ctx;
	/* Server connection the backend was initialized for. */
	struct smbd_server_connection *sconn;
	/* Event handlers handed to oplocks_event_dispatcher(). */
	const struct oplocks_event_ops *onefs_ops;
	/* Descriptor returned by oplocks_event_register(). */
	int onefs_event_fd;
	/* Read event registered for onefs_event_fd. */
	struct fd_event *read_fde;
};
/**
 * State of a callback record. It selects which member of the record's
 * data union is meaningful.
 */
enum onefs_callback_state {
	ONEFS_OPEN_FILE,		/* data.fsp is in use */
	ONEFS_WAITING_FOR_OPLOCK	/* data.mid is in use */
};
48 struct onefs_callback_record {
49 struct onefs_callback_record *prev, *next;
50 struct smbd_server_connection *sconn;
51 uint64_t id;
52 enum onefs_callback_state state;
53 union {
54 files_struct *fsp; /* ONEFS_OPEN_FILE */
55 uint64_t mid; /* ONEFS_WAITING_FOR_OPLOCK */
56 } data;
/**
 * Head of the internal list of files (along with additional state) that
 * have outstanding oplocks or requests for oplocks.
 */
struct onefs_callback_record *callback_recs = NULL;
65 /**
66 * Convert a onefs_callback_record to a debug string using the dbg_ctx().
68 const char *onefs_cb_record_str_dbg(const struct onefs_callback_record *r)
70 char *result;
72 if (r == NULL) {
73 result = talloc_strdup(talloc_tos(), "NULL callback record");
74 return result;
77 switch (r->state) {
78 case ONEFS_OPEN_FILE:
79 result = talloc_asprintf(talloc_tos(), "cb record %llu for "
80 "file %s", r->id,
81 fsp_str_dbg(r->data.fsp));
82 case ONEFS_WAITING_FOR_OPLOCK:
83 result = talloc_asprintf(talloc_tos(), "cb record %llu for "
84 "pending mid %llu", r->id,
85 (unsigned long long)r->data.mid);
86 break;
87 default:
88 result = talloc_asprintf(talloc_tos(), "cb record %llu unknown "
89 "state %d", r->id, r->state);
90 break;
93 return result;
96 /**
97 * Traverse the list of onefs_callback_records and print all entries.
99 static void debug_cb_records(const char *fn)
101 struct onefs_callback_record *rec;
103 if (DEBUGLEVEL < 10)
104 return;
106 DEBUG(10, ("cb records (%s):\n", fn));
108 for (rec = callback_recs; rec; rec = rec->next) {
109 DEBUGADD(10, ("%s\n", onefs_cb_record_str_dbg(rec)));
114 * Find a callback record in the list of outstanding oplock operations.
116 * Once n ifs_createfile requests an oplock on a file, the kernel communicates
117 * with samba via the oplock event channel by sending events that reference an
118 * id. This function maps that id to the onefs_callback_record that was
119 * created for it during the initial setup on open (onefs_oplock_wait_record).
120 * When a matching id is found in the onefs_callback_record list, the
121 * callback_type is checked to make sure the record is in in the correct
122 * state.
124 static struct onefs_callback_record *onefs_find_cb(uint64_t id,
125 enum onefs_callback_state expected_state)
127 struct onefs_callback_record *rec;
129 debug_cb_records("onefs_find_cb");
131 for (rec = callback_recs; rec; rec = rec->next) {
132 if (rec->id == id) {
133 DEBUG(10, ("found %s\n",
134 onefs_cb_record_str_dbg(rec)));
135 break;
139 if (rec == NULL) {
140 DEBUG(5, ("Could not find callback record for id %llu\n", id));
141 return NULL;
144 if (rec->state != expected_state) {
145 DEBUG(0, ("Expected cb type %d, got %s", expected_state,
146 onefs_cb_record_str_dbg(rec)));
147 SMB_ASSERT(0);
148 return NULL;
151 return rec;
155 * Remove and free a callback record from the callback record list.
157 void destroy_onefs_callback_record(uint64_t id)
159 struct onefs_callback_record *rec;
161 debug_cb_records("destroy_onefs_callback_record");
163 if (id == 0) {
164 DEBUG(10, ("destroy_onefs_callback_record: Nothing to "
165 "destroy\n"));
166 return;
169 for (rec = callback_recs; rec; rec = rec->next) {
170 if (rec->id == id) {
171 DLIST_REMOVE(callback_recs, rec);
172 SAFE_FREE(rec);
173 DEBUG(10, ("removed cb rec %llu\n", id));
174 return;
178 DEBUG(0, ("Could not find cb rec %llu to delete", id));
179 SMB_ASSERT(0);
183 * Initialize a callback record and add it to the list of outstanding callback
184 * records.
186 * This is called in the open path before ifs_createfile so an id can be
187 * passed in. Each callback record can be in one of two states:
189 * 1. WAITING_FOR_OPLOCK: This is the initial state for all callback
190 * records. If ifs_createfile can be completed syncronously without needing
191 * to break any level I oplocks, the state is transitioned to OPEN_FILE.
192 * Otherwise ifs_createfile will finish asynchronously and the open is
193 * deferred. When the necessary level I opocks have been broken, and the
194 * open can be done, an event is sent by the kernel on the oplock event
195 * channel, which is handled by semlock_available_handler. At this point
196 * the deferred open is retried. Unless a level I oplock was acquired by
197 * another client, ifs_createfile will now complete synchronously.
199 * 2. OPEN_FILE: Once ifs_createfile completes, the callback record is
200 * transitioned to this state via onefs_set_oplock_callback.
202 uint64_t onefs_oplock_wait_record(struct smbd_server_connection *sconn,
203 uint64_t mid)
205 struct onefs_callback_record *result;
206 static uint64_t id_generator = 0;
208 if (!(result = SMB_MALLOC_P(struct onefs_callback_record))) {
209 DEBUG(0, ("talloc failed\n"));
210 return 0;
213 memset(result, '\0', sizeof(result));
215 id_generator += 1;
216 if (id_generator == 0) {
217 /* Wow, that's a long-running smbd... */
218 id_generator += 1;
221 result->sconn = sconn;
222 result->id = id_generator;
224 result->state = ONEFS_WAITING_FOR_OPLOCK;
225 result->data.mid = mid;
226 DLIST_ADD(callback_recs, result);
228 DEBUG(10, ("New cb rec %llu created\n", result->id));
230 return result->id;
234 * Transition the callback record state to OPEN_FILE.
236 * This is called after the file is opened and an fsp struct has been
237 * allocated. The mid is dropped in favor of storing the fsp.
239 void onefs_set_oplock_callback(uint64_t id, files_struct *fsp)
241 struct onefs_callback_record *cb;
242 char *msg;
244 DEBUG(10, ("onefs_set_oplock_callback called for cb rec %llu\n", id));
246 if (!(cb = onefs_find_cb(id, ONEFS_WAITING_FOR_OPLOCK))) {
247 if (asprintf(&msg, "Got invalid callback %lld\n", id) != -1) {
248 smb_panic(msg);
250 smb_panic("Got invalid callback id\n");
254 * Paranoia check
256 if (open_was_deferred(cb->data.mid)) {
257 if (asprintf(&msg, "Trying to upgrade callback for deferred "
258 "open mid=%llu\n", (unsigned long long)cb->data.mid) != -1) {
259 smb_panic(msg);
261 smb_panic("Trying to upgrade callback for deferred open "
262 "mid\n");
265 cb->state = ONEFS_OPEN_FILE;
266 cb->data.fsp = fsp;
270 * Using a callback record, initialize a share mode entry to pass to
271 * share_mode_entry_to_message to send samba IPC messages.
273 static void init_share_mode_entry(struct share_mode_entry *sme,
274 struct onefs_callback_record *cb,
275 int op_type)
277 ZERO_STRUCT(*sme);
279 sme->pid = procid_self();
280 sme->op_type = op_type;
281 sme->id = cb->data.fsp->file_id;
282 sme->share_file_id = cb->data.fsp->fh->gen_id;
286 * Callback when a break-to-none event is received from the kernel.
288 * On OneFS level 1 oplocks are always broken to level 2 first, therefore an
289 * async level 2 break message is always sent when breaking to none. The
290 * downside of this is that OneFS currently has no way to express breaking
291 * directly from level 1 to none.
293 static void oplock_break_to_none_handler(uint64_t id)
295 struct onefs_callback_record *cb;
296 struct share_mode_entry sme;
297 char msg[MSG_SMB_SHARE_MODE_ENTRY_SIZE];
299 DEBUG(10, ("oplock_break_to_none_handler called for id %llu\n", id));
301 if (!(cb = onefs_find_cb(id, ONEFS_OPEN_FILE))) {
302 DEBUG(3, ("oplock_break_to_none_handler: could not find "
303 "callback id %llu\n", id));
304 return;
307 DEBUG(10, ("oplock_break_to_none_handler called for file %s\n",
308 fsp_str_dbg(cb->data.fsp)));
310 init_share_mode_entry(&sme, cb, FORCE_OPLOCK_BREAK_TO_NONE);
311 share_mode_entry_to_message(msg, &sme);
312 messaging_send_buf(cb->sconn->msg_ctx,
313 sme.pid,
314 MSG_SMB_ASYNC_LEVEL2_BREAK,
315 (uint8_t *)msg,
316 MSG_SMB_SHARE_MODE_ENTRY_SIZE);
319 * We could still receive an OPLOCK_REVOKED message, so keep the
320 * oplock_callback_id around.
325 * Callback when a break-to-level2 event is received from the kernel.
327 * Breaks from level 1 to level 2.
329 static void oplock_break_to_level_two_handler(uint64_t id)
331 struct onefs_callback_record *cb;
332 struct share_mode_entry sme;
333 char msg[MSG_SMB_SHARE_MODE_ENTRY_SIZE];
335 DEBUG(10, ("oplock_break_to_level_two_handler called for id %llu\n",
336 id));
338 if (!(cb = onefs_find_cb(id, ONEFS_OPEN_FILE))) {
339 DEBUG(3, ("oplock_break_to_level_two_handler: could not find "
340 "callback id %llu\n", id));
341 return;
344 DEBUG(10, ("oplock_break_to_level_two_handler called for file %s\n",
345 fsp_str_dbg(cb->data.fsp)));
347 init_share_mode_entry(&sme, cb, LEVEL_II_OPLOCK);
348 share_mode_entry_to_message(msg, &sme);
349 messaging_send_buf(cb->sconn->msg_ctx,
350 sme.pid,
351 MSG_SMB_BREAK_REQUEST,
352 (uint8_t *)msg,
353 MSG_SMB_SHARE_MODE_ENTRY_SIZE);
356 * We could still receive an OPLOCK_REVOKED or OPLOCK_BREAK_TO_NONE
357 * message, so keep the oplock_callback_id around.
362 * Revoke an oplock from an unresponsive client.
364 * The kernel will send this message when it times out waiting for a level 1
365 * oplock break to be acknowledged by the client. The oplock is then
366 * immediately removed.
368 static void oplock_revoked_handler(uint64_t id)
370 struct onefs_callback_record *cb;
371 files_struct *fsp = NULL;
373 DEBUG(10, ("oplock_revoked_handler called for id %llu\n", id));
375 if (!(cb = onefs_find_cb(id, ONEFS_OPEN_FILE))) {
376 DEBUG(3, ("oplock_revoked_handler: could not find "
377 "callback id %llu\n", id));
378 return;
381 fsp = cb->data.fsp;
383 SMB_ASSERT(fsp->oplock_timeout == NULL);
385 DEBUG(0,("Level 1 oplock break failed for file %s. Forcefully "
386 "revoking oplock\n", fsp_str_dbg(fsp)));
388 remove_oplock(fsp);
391 * cb record is cleaned up in fsp ext data destructor on close, so
392 * leave it in the list.
397 * Asynchronous ifs_createfile callback
399 * If ifs_createfile had to asynchronously break any oplocks, this function is
400 * called when the kernel sends an event that the open can be retried.
402 static void semlock_available_handler(uint64_t id)
404 struct onefs_callback_record *cb;
406 DEBUG(10, ("semlock_available_handler called: %llu\n", id));
408 if (!(cb = onefs_find_cb(id, ONEFS_WAITING_FOR_OPLOCK))) {
409 DEBUG(5, ("semlock_available_handler: Did not find callback "
410 "%llu\n", id));
411 return;
414 DEBUG(10, ("Got semlock available for mid %llu\n",
415 (unsigned long long)cb->data.mid));
417 /* Paranoia check */
418 if (!(open_was_deferred(cb->data.mid))) {
419 char *msg;
420 if (asprintf(&msg, "Semlock available on an open that wasn't "
421 "deferred: %s\n",
422 onefs_cb_record_str_dbg(cb)) != -1) {
423 smb_panic(msg);
425 smb_panic("Semlock available on an open that wasn't "
426 "deferred\n");
429 schedule_deferred_open_smb_message(cb->data.mid);
431 /* Cleanup the callback record since the open will be retried. */
432 destroy_onefs_callback_record(id);
434 return;
438 * Asynchronous ifs_createfile failure callback
440 * If ifs_createfile had to asynchronously break any oplocks, but an error was
441 * encountered in the kernel, the open will be retried with the state->failed
442 * set to true. This will prompt the open path to send an INTERNAL_ERROR
443 * error message to the client.
445 static void semlock_async_failure_handler(uint64_t id)
447 struct onefs_callback_record *cb;
448 struct deferred_open_record *state;
450 DEBUG(1, ("semlock_async_failure_handler called: %llu\n", id));
452 if (!(cb = onefs_find_cb(id, ONEFS_WAITING_FOR_OPLOCK))) {
453 DEBUG(5, ("semlock_async_failure_handler: Did not find callback "
454 "%llu\n", id));
455 return;
458 DEBUG(1, ("Got semlock_async_failure message for mid %llu\n",
459 (unsigned long long)cb->data.mid));
461 /* Paranoia check */
462 if (!(open_was_deferred(cb->data.mid))) {
463 char *msg;
464 if (asprintf(&msg, "Semlock failure on an open that wasn't "
465 "deferred: %s\n",
466 onefs_cb_record_str_dbg(cb)) != -1) {
467 smb_panic(msg);
469 smb_panic("Semlock failure on an open that wasn't deferred\n");
472 /* Find the actual deferred open record. */
473 if (!get_open_deferred_message_state(cb->data.mid, NULL, &state)) {
474 DEBUG(0, ("Could not find deferred request for "
475 "mid %d\n", cb->data.mid));
476 destroy_onefs_callback_record(id);
477 return;
480 /* Update to failed so the client can be notified on retried open. */
481 state->failed = true;
483 /* Schedule deferred open for immediate retry. */
484 schedule_deferred_open_smb_message(cb->data.mid);
486 /* Cleanup the callback record here since the open will be retried. */
487 destroy_onefs_callback_record(id);
489 return;
493 * OneFS acquires all oplocks via ifs_createfile, so this is a no-op.
495 static bool onefs_set_kernel_oplock(struct kernel_oplocks *_ctx,
496 files_struct *fsp, int oplock_type) {
497 return true;
501 * Release the kernel oplock.
503 static void onefs_release_kernel_oplock(struct kernel_oplocks *_ctx,
504 files_struct *fsp, int oplock_type)
506 enum oplock_type oplock = onefs_samba_oplock_to_oplock(oplock_type);
508 DEBUG(10, ("onefs_release_kernel_oplock: Releasing %s to type %s\n",
509 fsp_str_dbg(fsp), onefs_oplock_str(oplock)));
511 if (fsp->fh->fd == -1) {
512 DEBUG(1, ("no fd\n"));
513 return;
516 /* Downgrade oplock to either SHARED or NONE. */
517 if (ifs_oplock_downgrade(fsp->fh->fd, oplock)) {
518 DEBUG(1,("ifs_oplock_downgrade failed: %s\n",
519 strerror(errno)));
524 * Wrap ifs_semlock_write so it is only called on operations that aren't
525 * already contended in the kernel.
527 static void onefs_semlock_write(int fd, enum level2_contention_type type,
528 enum semlock_operation semlock_op)
530 int ret;
532 switch (type) {
533 case LEVEL2_CONTEND_ALLOC_GROW:
534 case LEVEL2_CONTEND_POSIX_BRL:
535 DEBUG(10, ("Taking %d write semlock for cmd %d on fd: %d\n",
536 semlock_op, type, fd));
537 ret = ifs_semlock_write(fd, semlock_op);
538 if (ret) {
539 DEBUG(0,("ifs_semlock_write failed taking %d write "
540 "semlock for cmd %d on fd: %d: %s",
541 semlock_op, type, fd, strerror(errno)));
543 break;
544 default:
545 DEBUG(10, ("Skipping write semlock for cmd %d on fd: %d\n",
546 type, fd));
551 * Contend level 2 oplocks in the kernel and smbd.
553 * Taking a write semlock will contend all level 2 oplocks in all smbds across
554 * the cluster except the fsp's own level 2 oplock. This lack of
555 * self-contention is a limitation of the current OneFS kernel oplocks
556 * implementation. Luckily it is easy to contend our own level 2 oplock by
557 * checking the the fsp's oplock_type. If it's a level2, send a break message
558 * to the client and remove the oplock.
560 static void onefs_contend_level2_oplocks_begin(files_struct *fsp,
561 enum level2_contention_type type)
563 /* Take care of level 2 kernel contention. */
564 onefs_semlock_write(fsp->fh->fd, type, SEMLOCK_LOCK);
566 /* Take care of level 2 self contention. */
567 if (LEVEL_II_OPLOCK_TYPE(fsp->oplock_type))
568 break_level2_to_none_async(fsp);
572 * Unlock the write semlock when the level 2 contending operation ends.
574 static void onefs_contend_level2_oplocks_end(files_struct *fsp,
575 enum level2_contention_type type)
577 /* Take care of level 2 kernel contention. */
578 onefs_semlock_write(fsp->fh->fd, type, SEMLOCK_UNLOCK);
582 * Return string value of onefs oplock types.
584 const char *onefs_oplock_str(enum oplock_type onefs_oplock_type)
586 switch (onefs_oplock_type) {
587 case OPLOCK_NONE:
588 return "OPLOCK_NONE";
589 case OPLOCK_EXCLUSIVE:
590 return "OPLOCK_EXCLUSIVE";
591 case OPLOCK_BATCH:
592 return "OPLOCK_BATCH";
593 case OPLOCK_SHARED:
594 return "OPLOCK_SHARED";
595 default:
596 break;
598 return "UNKNOWN";
602 * Convert from onefs to samba oplock.
604 int onefs_oplock_to_samba_oplock(enum oplock_type onefs_oplock)
606 switch (onefs_oplock) {
607 case OPLOCK_NONE:
608 return NO_OPLOCK;
609 case OPLOCK_EXCLUSIVE:
610 return EXCLUSIVE_OPLOCK;
611 case OPLOCK_BATCH:
612 return BATCH_OPLOCK;
613 case OPLOCK_SHARED:
614 return LEVEL_II_OPLOCK;
615 default:
616 DEBUG(0, ("unknown oplock type %d found\n", onefs_oplock));
617 break;
619 return NO_OPLOCK;
623 * Convert from samba to onefs oplock.
625 enum oplock_type onefs_samba_oplock_to_oplock(int samba_oplock_type)
627 if (BATCH_OPLOCK_TYPE(samba_oplock_type)) return OPLOCK_BATCH;
628 if (EXCLUSIVE_OPLOCK_TYPE(samba_oplock_type)) return OPLOCK_EXCLUSIVE;
629 if (LEVEL_II_OPLOCK_TYPE(samba_oplock_type)) return OPLOCK_SHARED;
630 return OPLOCK_NONE;
634 * Oplock event handler.
636 * Call into the event system dispatcher to handle each event.
638 static void onefs_oplocks_read_fde_handler(struct event_context *ev,
639 struct fd_event *fde,
640 uint16_t flags,
641 void *private_data)
643 struct onefs_oplocks_context *ctx =
644 talloc_get_type(private_data, struct onefs_oplocks_context);
646 if (oplocks_event_dispatcher(ctx->onefs_ops)) {
647 DEBUG(0, ("oplocks_event_dispatcher failed: %s\n",
648 strerror(errno)));
653 * Setup kernel oplocks
655 static const struct kernel_oplocks_ops onefs_koplocks_ops = {
656 .set_oplock = onefs_set_kernel_oplock,
657 .release_oplock = onefs_release_kernel_oplock,
658 .contend_level2_oplocks_begin = onefs_contend_level2_oplocks_begin,
659 .contend_level2_oplocks_end = onefs_contend_level2_oplocks_end,
662 static const struct oplocks_event_ops onefs_dispatch_ops = {
663 .oplock_break_to_none = oplock_break_to_none_handler,
664 .oplock_break_to_level_two = oplock_break_to_level_two_handler,
665 .oplock_revoked = oplock_revoked_handler,
666 .semlock_available = semlock_available_handler,
667 .semlock_async_failure = semlock_async_failure_handler,
670 struct kernel_oplocks *onefs_init_kernel_oplocks(struct smbd_server_connection *sconn)
672 struct kernel_oplocks *_ctx = NULL;
673 struct onefs_oplocks_context *ctx = NULL;
674 struct procoptions po = PROCOPTIONS_INIT;
676 DEBUG(10, ("onefs_init_kernel_oplocks called\n"));
678 /* Set the non-blocking proc flag */
679 po.po_flags_on |= P_NON_BLOCKING_SEMLOCK;
680 if (setprocoptions(&po) != 0) {
681 DEBUG(0, ("setprocoptions failed: %s.\n", strerror(errno)));
682 return NULL;
685 /* Setup the oplock contexts */
686 _ctx = talloc_zero(mem_ctx, struct kernel_oplocks);
687 if (!_ctx) {
688 return NULL;
691 ctx = talloc_zero(_ctx, struct onefs_oplocks_context);
692 if (!ctx) {
693 goto err_out;
695 ctx->sconn = sconn;
697 _ctx->ops = &onefs_koplocks_ops;
698 _ctx->flags = (KOPLOCKS_LEVEL2_SUPPORTED |
699 KOPLOCKS_DEFERRED_OPEN_NOTIFICATION |
700 KOPLOCKS_TIMEOUT_NOTIFICATION |
701 KOPLOCKS_OPLOCK_BROKEN_NOTIFICATION);
702 _ctx->private_data = ctx;
703 ctx->ctx = _ctx;
704 ctx->onefs_ops = &onefs_dispatch_ops;
706 /* Register an kernel event channel for oplocks */
707 ctx->onefs_event_fd = oplocks_event_register();
708 if (ctx->onefs_event_fd == -1) {
709 DEBUG(0, ("oplocks_event_register failed: %s\n",
710 strerror(errno)));
711 goto err_out;
714 DEBUG(10, ("oplock event_fd = %d\n", ctx->onefs_event_fd));
716 /* Register the oplock event_fd with samba's event system */
717 ctx->read_fde = event_add_fd(sconn->ev_ctx,
718 ctx,
719 ctx->onefs_event_fd,
720 EVENT_FD_READ,
721 onefs_oplocks_read_fde_handler,
722 ctx);
723 return _ctx;
725 err_out:
726 talloc_free(_ctx);
727 return NULL;
730 #else
/* Placeholder so this translation unit is not empty without OneFS. */
void oplock_onefs_dummy(void);

void oplock_onefs_dummy(void)
{
}
733 #endif /* HAVE_ONEFS */