s3: Fix bug #9085.
[Samba.git] / source3 / lib / g_lock.c
blob356c104bd151b5ae175fe04eca76f141cfd3b065
/*
   Unix SMB/CIFS implementation.
   global locks based on dbwrap and messaging
   Copyright (C) 2009 by Volker Lendecke

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
20 #include "includes.h"
21 #include "g_lock.h"
23 static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
24 struct server_id pid);
/*
 * Per-user handle for the global lock machinery: the lock database
 * plus the messaging context used for retry notifications.
 */
struct g_lock_ctx {
	/* "g_lock.tdb", opened in g_lock_ctx_init() */
	struct db_context *db;
	/* used to register for and send MSG_DBWRAP_G_LOCK_RETRY */
	struct messaging_context *msg;
};
32 * The "g_lock.tdb" file contains records, indexed by the 0-terminated
33 * lockname. The record contains an array of "struct g_lock_rec"
34 * structures. Waiters have the lock_type with G_LOCK_PENDING or'ed.
37 struct g_lock_rec {
38 enum g_lock_type lock_type;
39 struct server_id pid;
42 struct g_lock_ctx *g_lock_ctx_init(TALLOC_CTX *mem_ctx,
43 struct messaging_context *msg)
45 struct g_lock_ctx *result;
47 result = talloc(mem_ctx, struct g_lock_ctx);
48 if (result == NULL) {
49 return NULL;
51 result->msg = msg;
53 result->db = db_open(result, lock_path("g_lock.tdb"), 0,
54 TDB_CLEAR_IF_FIRST, O_RDWR|O_CREAT, 0700);
55 if (result->db == NULL) {
56 DEBUG(1, ("g_lock_init: Could not open g_lock.tdb"));
57 TALLOC_FREE(result);
58 return NULL;
60 return result;
63 static bool g_lock_conflicts(enum g_lock_type lock_type,
64 const struct g_lock_rec *rec)
66 enum g_lock_type rec_lock = rec->lock_type;
68 if ((rec_lock & G_LOCK_PENDING) != 0) {
69 return false;
73 * Only tested write locks so far. Very likely this routine
74 * needs to be fixed for read locks....
76 if ((lock_type == G_LOCK_READ) && (rec_lock == G_LOCK_READ)) {
77 return false;
79 return true;
82 static bool g_lock_parse(TALLOC_CTX *mem_ctx, TDB_DATA data,
83 int *pnum_locks, struct g_lock_rec **plocks)
85 int i, num_locks;
86 struct g_lock_rec *locks;
88 if ((data.dsize % sizeof(struct g_lock_rec)) != 0) {
89 DEBUG(1, ("invalid lock record length %d\n", (int)data.dsize));
90 return false;
93 num_locks = data.dsize / sizeof(struct g_lock_rec);
94 locks = talloc_array(mem_ctx, struct g_lock_rec, num_locks);
95 if (locks == NULL) {
96 DEBUG(1, ("talloc failed\n"));
97 return false;
100 memcpy(locks, data.dptr, data.dsize);
102 DEBUG(10, ("locks:\n"));
103 for (i=0; i<num_locks; i++) {
104 DEBUGADD(10, ("%s: %s %s\n",
105 procid_str(talloc_tos(), &locks[i].pid),
106 ((locks[i].lock_type & 1) == G_LOCK_READ) ?
107 "read" : "write",
108 (locks[i].lock_type & G_LOCK_PENDING) ?
109 "(pending)" : "(owner)"));
111 if (((locks[i].lock_type & G_LOCK_PENDING) == 0)
112 && !process_exists(locks[i].pid)) {
114 DEBUGADD(10, ("lock owner %s died -- discarding\n",
115 procid_str(talloc_tos(),
116 &locks[i].pid)));
118 if (i < (num_locks-1)) {
119 locks[i] = locks[num_locks-1];
121 num_locks -= 1;
125 *plocks = locks;
126 *pnum_locks = num_locks;
127 return true;
130 static void g_lock_cleanup(int *pnum_locks, struct g_lock_rec *locks)
132 int i, num_locks;
134 num_locks = *pnum_locks;
136 DEBUG(10, ("g_lock_cleanup: %d locks\n", num_locks));
138 for (i=0; i<num_locks; i++) {
139 if (process_exists(locks[i].pid)) {
140 continue;
142 DEBUGADD(10, ("%s does not exist -- discarding\n",
143 procid_str(talloc_tos(), &locks[i].pid)));
145 if (i < (num_locks-1)) {
146 locks[i] = locks[num_locks-1];
148 num_locks -= 1;
150 *pnum_locks = num_locks;
151 return;
154 static struct g_lock_rec *g_lock_addrec(TALLOC_CTX *mem_ctx,
155 struct g_lock_rec *locks,
156 int *pnum_locks,
157 const struct server_id pid,
158 enum g_lock_type lock_type)
160 struct g_lock_rec *result;
161 int num_locks = *pnum_locks;
163 result = talloc_realloc(mem_ctx, locks, struct g_lock_rec,
164 num_locks+1);
165 if (result == NULL) {
166 return NULL;
169 result[num_locks].pid = pid;
170 result[num_locks].lock_type = lock_type;
171 *pnum_locks += 1;
172 return result;
175 static void g_lock_got_retry(struct messaging_context *msg,
176 void *private_data,
177 uint32_t msg_type,
178 struct server_id server_id,
179 DATA_BLOB *data);
181 static NTSTATUS g_lock_trylock(struct g_lock_ctx *ctx, const char *name,
182 enum g_lock_type lock_type)
184 struct db_record *rec = NULL;
185 struct g_lock_rec *locks = NULL;
186 int i, num_locks;
187 struct server_id self;
188 int our_index;
189 TDB_DATA data;
190 NTSTATUS status = NT_STATUS_OK;
191 NTSTATUS store_status;
193 again:
194 rec = ctx->db->fetch_locked(ctx->db, talloc_tos(),
195 string_term_tdb_data(name));
196 if (rec == NULL) {
197 DEBUG(10, ("fetch_locked(\"%s\") failed\n", name));
198 status = NT_STATUS_LOCK_NOT_GRANTED;
199 goto done;
202 if (!g_lock_parse(talloc_tos(), rec->value, &num_locks, &locks)) {
203 DEBUG(10, ("g_lock_parse for %s failed\n", name));
204 status = NT_STATUS_INTERNAL_ERROR;
205 goto done;
208 self = procid_self();
209 our_index = -1;
211 for (i=0; i<num_locks; i++) {
212 if (procid_equal(&self, &locks[i].pid)) {
213 if (our_index != -1) {
214 DEBUG(1, ("g_lock_trylock: Added ourself "
215 "twice!\n"));
216 status = NT_STATUS_INTERNAL_ERROR;
217 goto done;
219 if ((locks[i].lock_type & G_LOCK_PENDING) == 0) {
220 DEBUG(1, ("g_lock_trylock: Found ourself not "
221 "pending!\n"));
222 status = NT_STATUS_INTERNAL_ERROR;
223 goto done;
226 our_index = i;
228 /* never conflict with ourself */
229 continue;
231 if (g_lock_conflicts(lock_type, &locks[i])) {
232 struct server_id pid = locks[i].pid;
234 if (!process_exists(pid)) {
235 TALLOC_FREE(locks);
236 TALLOC_FREE(rec);
237 status = g_lock_force_unlock(ctx, name, pid);
238 if (!NT_STATUS_IS_OK(status)) {
239 DEBUG(1, ("Could not unlock dead lock "
240 "holder!\n"));
241 goto done;
243 goto again;
245 lock_type |= G_LOCK_PENDING;
249 if (our_index == -1) {
250 /* First round, add ourself */
252 locks = g_lock_addrec(talloc_tos(), locks, &num_locks,
253 self, lock_type);
254 if (locks == NULL) {
255 DEBUG(10, ("g_lock_addrec failed\n"));
256 status = NT_STATUS_NO_MEMORY;
257 goto done;
259 } else {
261 * Retry. We were pending last time. Overwrite the
262 * stored lock_type with what we calculated, we might
263 * have acquired the lock this time.
265 locks[our_index].lock_type = lock_type;
268 if (NT_STATUS_IS_OK(status) && ((lock_type & G_LOCK_PENDING) == 0)) {
270 * Walk through the list of locks, search for dead entries
272 g_lock_cleanup(&num_locks, locks);
275 data = make_tdb_data((uint8_t *)locks, num_locks * sizeof(*locks));
276 store_status = rec->store(rec, data, 0);
277 if (!NT_STATUS_IS_OK(store_status)) {
278 DEBUG(1, ("rec->store failed: %s\n",
279 nt_errstr(store_status)));
280 status = store_status;
283 done:
284 TALLOC_FREE(locks);
285 TALLOC_FREE(rec);
287 if (NT_STATUS_IS_OK(status) && (lock_type & G_LOCK_PENDING) != 0) {
288 return STATUS_PENDING;
291 return NT_STATUS_OK;
294 NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
295 enum g_lock_type lock_type, struct timeval timeout)
297 struct tevent_timer *te = NULL;
298 NTSTATUS status;
299 bool retry = false;
300 struct timeval timeout_end;
301 struct timeval time_now;
303 DEBUG(10, ("Trying to acquire lock %d for %s\n", (int)lock_type,
304 name));
306 if (lock_type & ~1) {
307 DEBUG(1, ("Got invalid lock type %d for %s\n",
308 (int)lock_type, name));
309 return NT_STATUS_INVALID_PARAMETER;
312 #ifdef CLUSTER_SUPPORT
313 if (lp_clustering()) {
314 status = ctdb_watch_us(messaging_ctdbd_connection());
315 if (!NT_STATUS_IS_OK(status)) {
316 DEBUG(10, ("could not register retry with ctdb: %s\n",
317 nt_errstr(status)));
318 goto done;
321 #endif
323 status = messaging_register(ctx->msg, &retry, MSG_DBWRAP_G_LOCK_RETRY,
324 g_lock_got_retry);
325 if (!NT_STATUS_IS_OK(status)) {
326 DEBUG(10, ("messaging_register failed: %s\n",
327 nt_errstr(status)));
328 return status;
331 time_now = timeval_current();
332 timeout_end = timeval_sum(&time_now, &timeout);
334 while (true) {
335 #ifdef CLUSTER_SUPPORT
336 fd_set _r_fds;
337 #endif
338 fd_set *r_fds = NULL;
339 int max_fd = 0;
340 int ret;
341 struct timeval timeout_remaining, select_timeout;
343 status = g_lock_trylock(ctx, name, lock_type);
344 if (NT_STATUS_IS_OK(status)) {
345 DEBUG(10, ("Got lock %s\n", name));
346 break;
348 if (!NT_STATUS_EQUAL(status, STATUS_PENDING)) {
349 DEBUG(10, ("g_lock_trylock failed: %s\n",
350 nt_errstr(status)));
351 break;
354 DEBUG(10, ("g_lock_trylock: Did not get lock, waiting...\n"));
356 /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
357 * !!! HACK ALERT --- FIX ME !!!
358 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
359 * What we really want to do here is to react to
360 * MSG_DBWRAP_G_LOCK_RETRY messages that are either sent
361 * by a client doing g_lock_unlock or by ourselves when
362 * we receive a CTDB_SRVID_SAMBA_NOTIFY or
363 * CTDB_SRVID_RECONFIGURE message from ctdbd, i.e. when
364 * either a client holding a lock or a complete node
365 * has died.
367 * Doing this properly involves calling tevent_loop_once(),
368 * but doing this here with the main ctdbd messaging context
369 * creates a nested event loop when g_lock_lock() is called
370 * from the main event loop, e.g. in a tcon_and_X where the
371 * share_info.tdb needs to be initialized and is locked by
372 * another process, or when the remore registry is accessed
373 * for writing and some other process already holds a lock
374 * on the registry.tdb.
376 * So as a quick fix, we act a little coarsely here: we do
377 * a select on the ctdb connection fd and when it is readable
378 * or we get EINTR, then we retry without actually parsing
379 * any ctdb packages or dispatching messages. This means that
380 * we retry more often than intended by design, but this does
381 * not harm and it is unobtrusive. When we have finished,
382 * the main loop will pick up all the messages and ctdb
383 * packets. The only extra twist is that we cannot use timed
384 * events here but have to handcode a timeout.
387 #ifdef CLUSTER_SUPPORT
388 if (lp_clustering()) {
389 struct ctdbd_connection *conn = messaging_ctdbd_connection();
391 r_fds = &_r_fds;
392 FD_ZERO(r_fds);
393 max_fd = ctdbd_conn_get_fd(conn);
394 if (max_fd >= 0 && max_fd < FD_SETSIZE) {
395 FD_SET(max_fd, r_fds);
398 #endif
400 time_now = timeval_current();
401 timeout_remaining = timeval_until(&time_now, &timeout_end);
402 select_timeout = timeval_set(60, 0);
404 select_timeout = timeval_min(&select_timeout,
405 &timeout_remaining);
407 ret = sys_select(max_fd + 1, r_fds, NULL, NULL,
408 &select_timeout);
409 if (ret == -1) {
410 if (errno != EINTR) {
411 DEBUG(1, ("error calling select: %s\n",
412 strerror(errno)));
413 status = NT_STATUS_INTERNAL_ERROR;
414 break;
417 * errno == EINTR:
418 * This means a signal was received.
419 * It might have been a MSG_DBWRAP_G_LOCK_RETRY message.
420 * ==> retry
422 } else if (ret == 0) {
423 if (timeval_expired(&timeout_end)) {
424 DEBUG(10, ("g_lock_lock timed out\n"));
425 status = NT_STATUS_LOCK_NOT_GRANTED;
426 break;
427 } else {
428 DEBUG(10, ("select returned 0 but timeout not "
429 "not expired, retrying\n"));
431 } else if (ret != 1) {
432 DEBUG(1, ("invalid return code of select: %d\n", ret));
433 status = NT_STATUS_INTERNAL_ERROR;
434 break;
437 * ret == 1:
438 * This means ctdbd has sent us some data.
439 * Might be a CTDB_SRVID_RECONFIGURE or a
440 * CTDB_SRVID_SAMBA_NOTIFY message.
441 * ==> retry
445 #ifdef CLUSTER_SUPPORT
446 done:
447 #endif
449 if (!NT_STATUS_IS_OK(status)) {
450 NTSTATUS unlock_status;
452 unlock_status = g_lock_unlock(ctx, name);
454 if (!NT_STATUS_IS_OK(unlock_status)) {
455 DEBUG(1, ("Could not remove ourself from the locking "
456 "db: %s\n", nt_errstr(status)));
460 messaging_deregister(ctx->msg, MSG_DBWRAP_G_LOCK_RETRY, &retry);
461 TALLOC_FREE(te);
463 return status;
466 static void g_lock_got_retry(struct messaging_context *msg,
467 void *private_data,
468 uint32_t msg_type,
469 struct server_id server_id,
470 DATA_BLOB *data)
472 bool *pretry = (bool *)private_data;
474 DEBUG(10, ("Got retry message from pid %s\n",
475 procid_str(talloc_tos(), &server_id)));
477 *pretry = true;
480 static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
481 struct server_id pid)
483 struct db_record *rec = NULL;
484 struct g_lock_rec *locks = NULL;
485 int i, num_locks;
486 enum g_lock_type lock_type;
487 NTSTATUS status;
489 rec = ctx->db->fetch_locked(ctx->db, talloc_tos(),
490 string_term_tdb_data(name));
491 if (rec == NULL) {
492 DEBUG(10, ("fetch_locked(\"%s\") failed\n", name));
493 status = NT_STATUS_INTERNAL_ERROR;
494 goto done;
497 if (!g_lock_parse(talloc_tos(), rec->value, &num_locks, &locks)) {
498 DEBUG(10, ("g_lock_parse for %s failed\n", name));
499 status = NT_STATUS_INTERNAL_ERROR;
500 goto done;
503 for (i=0; i<num_locks; i++) {
504 if (procid_equal(&pid, &locks[i].pid)) {
505 break;
509 if (i == num_locks) {
510 DEBUG(10, ("g_lock_force_unlock: Lock not found\n"));
511 status = NT_STATUS_INTERNAL_ERROR;
512 goto done;
515 lock_type = locks[i].lock_type;
517 if (i < (num_locks-1)) {
518 locks[i] = locks[num_locks-1];
520 num_locks -= 1;
522 if (num_locks == 0) {
523 status = rec->delete_rec(rec);
524 } else {
525 TDB_DATA data;
526 data = make_tdb_data((uint8_t *)locks,
527 sizeof(struct g_lock_rec) * num_locks);
528 status = rec->store(rec, data, 0);
531 if (!NT_STATUS_IS_OK(status)) {
532 DEBUG(1, ("g_lock_force_unlock: Could not store record: %s\n",
533 nt_errstr(status)));
534 goto done;
537 TALLOC_FREE(rec);
539 if ((lock_type & G_LOCK_PENDING) == 0) {
540 int num_wakeups = 0;
543 * We've been the lock holder. Others to retry. Don't
544 * tell all others to avoid a thundering herd. In case
545 * this leads to a complete stall because we miss some
546 * processes, the loop in g_lock_lock tries at least
547 * once a minute.
550 for (i=0; i<num_locks; i++) {
551 if ((locks[i].lock_type & G_LOCK_PENDING) == 0) {
552 continue;
554 if (!process_exists(locks[i].pid)) {
555 continue;
559 * Ping all waiters to retry
561 status = messaging_send(ctx->msg, locks[i].pid,
562 MSG_DBWRAP_G_LOCK_RETRY,
563 &data_blob_null);
564 if (!NT_STATUS_IS_OK(status)) {
565 DEBUG(1, ("sending retry to %s failed: %s\n",
566 procid_str(talloc_tos(),
567 &locks[i].pid),
568 nt_errstr(status)));
569 } else {
570 num_wakeups += 1;
572 if (num_wakeups > 5) {
573 break;
577 done:
579 * For the error path, TALLOC_FREE(rec) as well. In the good
580 * path we have already freed it.
582 TALLOC_FREE(rec);
584 TALLOC_FREE(locks);
585 return status;
588 NTSTATUS g_lock_unlock(struct g_lock_ctx *ctx, const char *name)
590 NTSTATUS status;
592 status = g_lock_force_unlock(ctx, name, procid_self());
594 #ifdef CLUSTER_SUPPORT
595 if (lp_clustering()) {
596 ctdb_unwatch(messaging_ctdbd_connection());
598 #endif
599 return status;
/* Closure handed through the database traverse in g_lock_locks(). */
struct g_lock_locks_state {
	/* user callback, invoked once per lock name */
	int (*fn)(const char *name, void *private_data);
	/* opaque pointer passed back to fn */
	void *private_data;
};
607 static int g_lock_locks_fn(struct db_record *rec, void *priv)
609 struct g_lock_locks_state *state = (struct g_lock_locks_state *)priv;
611 if ((rec->key.dsize == 0) || (rec->key.dptr[rec->key.dsize-1] != 0)) {
612 DEBUG(1, ("invalid key in g_lock.tdb, ignoring\n"));
613 return 0;
615 return state->fn((char *)rec->key.dptr, state->private_data);
618 int g_lock_locks(struct g_lock_ctx *ctx,
619 int (*fn)(const char *name, void *private_data),
620 void *private_data)
622 struct g_lock_locks_state state;
624 state.fn = fn;
625 state.private_data = private_data;
627 return ctx->db->traverse_read(ctx->db, g_lock_locks_fn, &state);
630 NTSTATUS g_lock_dump(struct g_lock_ctx *ctx, const char *name,
631 int (*fn)(struct server_id pid,
632 enum g_lock_type lock_type,
633 void *private_data),
634 void *private_data)
636 TDB_DATA data;
637 int i, num_locks;
638 struct g_lock_rec *locks = NULL;
639 bool ret;
641 if (ctx->db->fetch(ctx->db, talloc_tos(), string_term_tdb_data(name),
642 &data) != 0) {
643 return NT_STATUS_NOT_FOUND;
646 if ((data.dsize == 0) || (data.dptr == NULL)) {
647 return NT_STATUS_OK;
650 ret = g_lock_parse(talloc_tos(), data, &num_locks, &locks);
652 TALLOC_FREE(data.dptr);
654 if (!ret) {
655 DEBUG(10, ("g_lock_parse for %s failed\n", name));
656 return NT_STATUS_INTERNAL_ERROR;
659 for (i=0; i<num_locks; i++) {
660 if (fn(locks[i].pid, locks[i].lock_type, private_data) != 0) {
661 break;
664 TALLOC_FREE(locks);
665 return NT_STATUS_OK;
/* Result carrier for g_lock_get(), filled in by g_lock_get_fn(). */
struct g_lock_get_state {
	/* true once a non-pending entry has been seen */
	bool found;
	/* caller-provided destination for the owner's server id */
	struct server_id *pid;
};
673 static int g_lock_get_fn(struct server_id pid, enum g_lock_type lock_type,
674 void *priv)
676 struct g_lock_get_state *state = (struct g_lock_get_state *)priv;
678 if ((lock_type & G_LOCK_PENDING) != 0) {
679 return 0;
682 state->found = true;
683 *state->pid = pid;
684 return 1;
687 NTSTATUS g_lock_get(struct g_lock_ctx *ctx, const char *name,
688 struct server_id *pid)
690 struct g_lock_get_state state;
691 NTSTATUS status;
693 state.found = false;
694 state.pid = pid;
696 status = g_lock_dump(ctx, name, g_lock_get_fn, &state);
697 if (!NT_STATUS_IS_OK(status)) {
698 return status;
700 if (!state.found) {
701 return NT_STATUS_NOT_FOUND;
703 return NT_STATUS_OK;
706 static bool g_lock_init_all(TALLOC_CTX *mem_ctx,
707 struct tevent_context **pev,
708 struct messaging_context **pmsg,
709 struct g_lock_ctx **pg_ctx)
711 struct tevent_context *ev = NULL;
712 struct messaging_context *msg = NULL;
713 struct g_lock_ctx *g_ctx = NULL;
715 ev = tevent_context_init(mem_ctx);
716 if (ev == NULL) {
717 d_fprintf(stderr, "ERROR: could not init event context\n");
718 goto fail;
720 msg = messaging_init(mem_ctx, procid_self(), ev);
721 if (msg == NULL) {
722 d_fprintf(stderr, "ERROR: could not init messaging context\n");
723 goto fail;
725 g_ctx = g_lock_ctx_init(mem_ctx, msg);
726 if (g_ctx == NULL) {
727 d_fprintf(stderr, "ERROR: could not init g_lock context\n");
728 goto fail;
731 *pev = ev;
732 *pmsg = msg;
733 *pg_ctx = g_ctx;
734 return true;
735 fail:
736 TALLOC_FREE(g_ctx);
737 TALLOC_FREE(msg);
738 TALLOC_FREE(ev);
739 return false;
742 NTSTATUS g_lock_do(const char *name, enum g_lock_type lock_type,
743 struct timeval timeout,
744 void (*fn)(void *private_data), void *private_data)
746 struct tevent_context *ev = NULL;
747 struct messaging_context *msg = NULL;
748 struct g_lock_ctx *g_ctx = NULL;
749 NTSTATUS status;
751 if (!g_lock_init_all(talloc_tos(), &ev, &msg, &g_ctx)) {
752 status = NT_STATUS_ACCESS_DENIED;
753 goto done;
756 status = g_lock_lock(g_ctx, name, lock_type, timeout);
757 if (!NT_STATUS_IS_OK(status)) {
758 goto done;
760 fn(private_data);
761 g_lock_unlock(g_ctx, name);
763 done:
764 TALLOC_FREE(g_ctx);
765 TALLOC_FREE(msg);
766 TALLOC_FREE(ev);
767 return status;