2 Unix SMB/CIFS implementation.
3 global locks based on dbwrap and messaging
4 Copyright (C) 2009 by Volker Lendecke
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "dbwrap/dbwrap.h"
23 #include "dbwrap/dbwrap_open.h"
26 #include "ctdbd_conn.h"
27 #include "../lib/util/select.h"
28 #include "system/select.h"
/*
 * Forward declaration of g_lock_force_unlock(), which is defined further
 * down in this file. It is needed up here because g_lock_trylock() calls
 * it to drop a conflicting record whose owning process no longer exists.
 */
31 static NTSTATUS
g_lock_force_unlock(struct g_lock_ctx
*ctx
, const char *name
,
32 struct server_id pid
);
35 struct db_context
*db
;
36 struct messaging_context
*msg
;
40 * The "g_lock.tdb" file contains records, indexed by the 0-terminated
41 * lockname. The record contains an array of "struct g_lock_rec"
42 * structures. Waiters have the lock_type with G_LOCK_PENDING or'ed.
46 enum g_lock_type lock_type
;
/*
 * Create a g_lock context.
 *
 * The context is allocated with talloc() as a child of mem_ctx. Its db
 * member is opened with db_open() on lock_path("g_lock.tdb"), parented to
 * the new context, using TDB_CLEAR_IF_FIRST|TDB_INCOMPATIBLE_HASH as tdb
 * flags, O_RDWR|O_CREAT as open flags and mode 0600. A failed open is
 * logged at debug level 1.
 *
 * NOTE(review): the NULL check on the allocation, the place where msg is
 * stored in the context, and all return statements are not visible in
 * this excerpt -- confirm against the full file.
 */
50 struct g_lock_ctx
*g_lock_ctx_init(TALLOC_CTX
*mem_ctx
,
51 struct messaging_context
*msg
)
53 struct g_lock_ctx
*result
;
55 result
= talloc(mem_ctx
, struct g_lock_ctx
);
61 result
->db
= db_open(result
, lock_path("g_lock.tdb"), 0,
62 TDB_CLEAR_IF_FIRST
|TDB_INCOMPATIBLE_HASH
,
63 O_RDWR
|O_CREAT
, 0600);
64 if (result
->db
== NULL
) {
65 DEBUG(1, ("g_lock_init: Could not open g_lock.tdb\n"));
/*
 * Decide whether the stored record rec conflicts with a new request of
 * type lock_type.
 *
 * Two situations are singled out by the visible conditions: a stored
 * record that carries G_LOCK_PENDING (a waiter), and the combination of a
 * G_LOCK_READ request with a G_LOCK_READ record.
 *
 * NOTE(review): the values returned from these branches and from the
 * fall-through path are not visible in this excerpt; presumably both
 * special cases report "no conflict" -- TODO confirm.
 */
72 static bool g_lock_conflicts(enum g_lock_type lock_type
,
73 const struct g_lock_rec
*rec
)
75 enum g_lock_type rec_lock
= rec
->lock_type
;
77 if ((rec_lock
& G_LOCK_PENDING
) != 0) {
82 * Only tested write locks so far. Very likely this routine
83 * needs to be fixed for read locks....
85 if ((lock_type
== G_LOCK_READ
) && (rec_lock
== G_LOCK_READ
)) {
/*
 * Decode a raw g_lock.tdb record value into an array of struct g_lock_rec.
 *
 * - data.dsize must be a multiple of sizeof(struct g_lock_rec); otherwise
 *   the length is logged at debug level 1.
 * - The entries are copied with memcpy() into a talloc_array() allocated
 *   on mem_ctx, so the caller gets a private, modifiable copy.
 * - Every entry is dumped at debug level 10 (owner pid, read/write, and
 *   whether it is pending or an owner).
 * - An entry that is NOT pending (i.e. a lock owner) and whose process no
 *   longer exists is discarded by overwriting its slot with the last
 *   array element. Pending entries are not tested here.
 * - The resulting count is written to *pnum_locks.
 *
 * NOTE(review): the decrement of num_locks after a discard, the store to
 * *plocks and the return statements are not visible in this excerpt --
 * confirm against the full file.
 */
91 static bool g_lock_parse(TALLOC_CTX
*mem_ctx
, TDB_DATA data
,
92 int *pnum_locks
, struct g_lock_rec
**plocks
)
95 struct g_lock_rec
*locks
;
97 if ((data
.dsize
% sizeof(struct g_lock_rec
)) != 0) {
98 DEBUG(1, ("invalid lock record length %d\n", (int)data
.dsize
));
102 num_locks
= data
.dsize
/ sizeof(struct g_lock_rec
);
103 locks
= talloc_array(mem_ctx
, struct g_lock_rec
, num_locks
);
105 DEBUG(1, ("talloc failed\n"));
109 memcpy(locks
, data
.dptr
, data
.dsize
);
111 DEBUG(10, ("locks:\n"));
112 for (i
=0; i
<num_locks
; i
++) {
113 DEBUGADD(10, ("%s: %s %s\n",
114 server_id_str(talloc_tos(), &locks
[i
].pid
),
115 ((locks
[i
].lock_type
& 1) == G_LOCK_READ
) ?
117 (locks
[i
].lock_type
& G_LOCK_PENDING
) ?
118 "(pending)" : "(owner)"));
120 if (((locks
[i
].lock_type
& G_LOCK_PENDING
) == 0)
121 && !process_exists(locks
[i
].pid
)) {
123 DEBUGADD(10, ("lock owner %s died -- discarding\n",
124 server_id_str(talloc_tos(),
127 if (i
< (num_locks
-1)) {
128 locks
[i
] = locks
[num_locks
-1];
135 *pnum_locks
= num_locks
;
/*
 * Prune entries of dead processes from the locks array, in place.
 *
 * The current count is read from *pnum_locks. Every entry is tested with
 * process_exists(); unlike the check in g_lock_parse() the visible
 * condition does not look at G_LOCK_PENDING, so owners and waiters are
 * both examined. An entry whose process is gone is logged at debug level
 * 10 and overwritten with the last array element. The resulting count is
 * written back through pnum_locks.
 *
 * NOTE(review): the statement taken for live processes (presumably a
 * continue) and the decrement of num_locks are not visible in this
 * excerpt -- confirm against the full file.
 */
139 static void g_lock_cleanup(int *pnum_locks
, struct g_lock_rec
*locks
)
143 num_locks
= *pnum_locks
;
145 DEBUG(10, ("g_lock_cleanup: %d locks\n", num_locks
));
147 for (i
=0; i
<num_locks
; i
++) {
148 if (process_exists(locks
[i
].pid
)) {
151 DEBUGADD(10, ("%s does not exist -- discarding\n",
152 server_id_str(talloc_tos(), &locks
[i
].pid
)));
154 if (i
< (num_locks
-1)) {
155 locks
[i
] = locks
[num_locks
-1];
159 *pnum_locks
= num_locks
;
/*
 * Append one record (pid, lock_type) to the locks array.
 *
 * The array is resized with talloc_realloc() on mem_ctx. The resized
 * pointer is kept in a separate variable (result) and checked against
 * NULL, so the incoming locks pointer is not overwritten by the call
 * itself. The new record is written at index num_locks, where num_locks
 * is the count read from *pnum_locks on entry.
 *
 * NOTE(review): the declaration of the pnum_locks parameter, the element
 * count passed to talloc_realloc(), the update of *pnum_locks and the
 * return statements are not visible in this excerpt -- confirm against
 * the full file.
 */
163 static struct g_lock_rec
*g_lock_addrec(TALLOC_CTX
*mem_ctx
,
164 struct g_lock_rec
*locks
,
166 const struct server_id pid
,
167 enum g_lock_type lock_type
)
169 struct g_lock_rec
*result
;
170 int num_locks
= *pnum_locks
;
172 result
= talloc_realloc(mem_ctx
, locks
, struct g_lock_rec
,
174 if (result
== NULL
) {
178 result
[num_locks
].pid
= pid
;
179 result
[num_locks
].lock_type
= lock_type
;
/*
 * Forward declaration of g_lock_got_retry(), the message callback that is
 * defined after g_lock_lock() in this file. Only part of its parameter
 * list is visible in this excerpt.
 * NOTE(review): presumably this is the handler g_lock_lock() registers
 * for MSG_DBWRAP_G_LOCK_RETRY; the registration argument itself is not
 * visible -- TODO confirm.
 */
184 static void g_lock_got_retry(struct messaging_context
*msg
,
187 struct server_id server_id
,
/*
 * Make a single attempt to take the lock called name with the requested
 * lock_type.
 *
 * Steps visible in this excerpt:
 *  1. The record keyed by the 0-terminated name is fetched and locked
 *     with dbwrap_fetch_locked(). The visible failure path logs and sets
 *     NT_STATUS_LOCK_NOT_GRANTED.
 *  2. The record value is decoded with g_lock_parse(); a parse failure
 *     sets NT_STATUS_INTERNAL_ERROR.
 *  3. All entries are scanned. Our own server id comes from
 *     messaging_server_id(ctx->msg) and is compared with procid_equal().
 *     Finding ourself twice, or finding ourself in a non-pending state,
 *     is treated as NT_STATUS_INTERNAL_ERROR. Our own entry never counts
 *     as a conflict.
 *  4. For an entry that g_lock_conflicts() reports as conflicting: if its
 *     process no longer exists, the stale entry is removed through
 *     g_lock_force_unlock(); the request is marked by or-ing
 *     G_LOCK_PENDING into lock_type.
 *  5. On the first round (our_index == -1) we append ourself with
 *     g_lock_addrec() (failure: NT_STATUS_NO_MEMORY); on a retry the
 *     stored lock_type of our existing entry is overwritten.
 *  6. If the lock was obtained (status OK and not pending), dead entries
 *     are pruned with g_lock_cleanup().
 *  7. The array is written back with dbwrap_record_store(); a store
 *     failure is logged and replaces status.
 *
 * Returns STATUS_PENDING when status is OK but the request is still
 * pending. The remaining return statements, the control flow between the
 * steps (goto/continue/retry) and the release of rec are not visible in
 * this excerpt -- NOTE(review): confirm against the full file.
 */
190 static NTSTATUS
g_lock_trylock(struct g_lock_ctx
*ctx
, const char *name
,
191 enum g_lock_type lock_type
)
193 struct db_record
*rec
= NULL
;
194 struct g_lock_rec
*locks
= NULL
;
196 struct server_id self
;
199 NTSTATUS status
= NT_STATUS_OK
;
200 NTSTATUS store_status
;
204 rec
= dbwrap_fetch_locked(ctx
->db
, talloc_tos(),
205 string_term_tdb_data(name
));
207 DEBUG(10, ("fetch_locked(\"%s\") failed\n", name
));
208 status
= NT_STATUS_LOCK_NOT_GRANTED
;
212 value
= dbwrap_record_get_value(rec
);
213 if (!g_lock_parse(talloc_tos(), value
, &num_locks
, &locks
)) {
214 DEBUG(10, ("g_lock_parse for %s failed\n", name
));
215 status
= NT_STATUS_INTERNAL_ERROR
;
219 self
= messaging_server_id(ctx
->msg
);
222 for (i
=0; i
<num_locks
; i
++) {
223 if (procid_equal(&self
, &locks
[i
].pid
)) {
224 if (our_index
!= -1) {
225 DEBUG(1, ("g_lock_trylock: Added ourself "
227 status
= NT_STATUS_INTERNAL_ERROR
;
230 if ((locks
[i
].lock_type
& G_LOCK_PENDING
) == 0) {
231 DEBUG(1, ("g_lock_trylock: Found ourself not "
233 status
= NT_STATUS_INTERNAL_ERROR
;
239 /* never conflict with ourself */
242 if (g_lock_conflicts(lock_type
, &locks
[i
])) {
243 struct server_id pid
= locks
[i
].pid
;
245 if (!process_exists(pid
)) {
248 status
= g_lock_force_unlock(ctx
, name
, pid
);
249 if (!NT_STATUS_IS_OK(status
)) {
250 DEBUG(1, ("Could not unlock dead lock "
256 lock_type
|= G_LOCK_PENDING
;
260 if (our_index
== -1) {
261 /* First round, add ourself */
263 locks
= g_lock_addrec(talloc_tos(), locks
, &num_locks
,
266 DEBUG(10, ("g_lock_addrec failed\n"));
267 status
= NT_STATUS_NO_MEMORY
;
272 * Retry. We were pending last time. Overwrite the
273 * stored lock_type with what we calculated, we might
274 * have acquired the lock this time.
276 locks
[our_index
].lock_type
= lock_type
;
279 if (NT_STATUS_IS_OK(status
) && ((lock_type
& G_LOCK_PENDING
) == 0)) {
281 * Walk through the list of locks, search for dead entries
283 g_lock_cleanup(&num_locks
, locks
);
286 data
= make_tdb_data((uint8_t *)locks
, num_locks
* sizeof(*locks
));
287 store_status
= dbwrap_record_store(rec
, data
, 0);
288 if (!NT_STATUS_IS_OK(store_status
)) {
289 DEBUG(1, ("rec->store failed: %s\n",
290 nt_errstr(store_status
)));
291 status
= store_status
;
298 if (NT_STATUS_IS_OK(status
) && (lock_type
& G_LOCK_PENDING
) != 0) {
299 return STATUS_PENDING
;
/*
 * Acquire the lock called name, waiting up to timeout.
 *
 * Visible behaviour:
 *  - A lock_type with any bit other than bit 0 set is rejected with
 *    NT_STATUS_INVALID_PARAMETER.
 *  - With CLUSTER_SUPPORT and lp_clustering(), ctdb_watch_us() is called
 *    on the ctdbd connection first.
 *  - A handler for MSG_DBWRAP_G_LOCK_RETRY is registered on ctx->msg via
 *    messaging_register() and deregistered again at the end.
 *  - timeout_end is computed as the current time plus timeout.
 *  - Each round calls g_lock_trylock(). NT_STATUS_OK means the lock was
 *    obtained; any status other than STATUS_PENDING is logged as a
 *    failure. On STATUS_PENDING the function waits in sys_poll() on a
 *    two-element pollfd array (slot 0 is the ctdb fd when clustering).
 *    The wait starts from 60 seconds and is passed through timeval_min()
 *    (second operand not visible; presumably the remaining time).
 *  - sys_poll() outcomes: an error other than EINTR gives
 *    NT_STATUS_INTERNAL_ERROR; a return of 0 with timeout_end expired
 *    gives NT_STATUS_LOCK_NOT_GRANTED; a return other than 1 gives
 *    NT_STATUS_INTERNAL_ERROR; everything else leads to another attempt.
 *  - If the final status is not OK, g_lock_unlock() is called to remove
 *    our own entry from the locking db.
 *
 * NOTE(review): when g_lock_unlock() fails, the logged message formats
 * nt_errstr(status) rather than nt_errstr(unlock_status), so it reports
 * the earlier failure instead of the unlock failure. This looks like a
 * bug -- confirm and switch to unlock_status.
 * NOTE(review): several log messages still say "select" although the
 * wait is done with sys_poll().
 * NOTE(review): the loop construct, the retry flag handling and the
 * return statements are not visible in this excerpt.
 */
305 NTSTATUS
g_lock_lock(struct g_lock_ctx
*ctx
, const char *name
,
306 enum g_lock_type lock_type
, struct timeval timeout
)
308 struct tevent_timer
*te
= NULL
;
311 struct timeval timeout_end
;
312 struct timeval time_now
;
314 DEBUG(10, ("Trying to acquire lock %d for %s\n", (int)lock_type
,
317 if (lock_type
& ~1) {
318 DEBUG(1, ("Got invalid lock type %d for %s\n",
319 (int)lock_type
, name
));
320 return NT_STATUS_INVALID_PARAMETER
;
323 #ifdef CLUSTER_SUPPORT
324 if (lp_clustering()) {
325 status
= ctdb_watch_us(messaging_ctdbd_connection());
326 if (!NT_STATUS_IS_OK(status
)) {
327 DEBUG(10, ("could not register retry with ctdb: %s\n",
334 status
= messaging_register(ctx
->msg
, &retry
, MSG_DBWRAP_G_LOCK_RETRY
,
336 if (!NT_STATUS_IS_OK(status
)) {
337 DEBUG(10, ("messaging_register failed: %s\n",
342 time_now
= timeval_current();
343 timeout_end
= timeval_sum(&time_now
, &timeout
);
346 struct pollfd
*pollfds
;
350 struct timeval timeout_remaining
, select_timeout
;
352 status
= g_lock_trylock(ctx
, name
, lock_type
);
353 if (NT_STATUS_IS_OK(status
)) {
354 DEBUG(10, ("Got lock %s\n", name
));
357 if (!NT_STATUS_EQUAL(status
, STATUS_PENDING
)) {
358 DEBUG(10, ("g_lock_trylock failed: %s\n",
363 DEBUG(10, ("g_lock_trylock: Did not get lock, waiting...\n"));
365 /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
366 * !!! HACK ALERT --- FIX ME !!!
367 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
368 * What we really want to do here is to react to
369 * MSG_DBWRAP_G_LOCK_RETRY messages that are either sent
370 * by a client doing g_lock_unlock or by ourselves when
371 * we receive a CTDB_SRVID_SAMBA_NOTIFY or
372 * CTDB_SRVID_RECONFIGURE message from ctdbd, i.e. when
373 * either a client holding a lock or a complete node
376 * Doing this properly involves calling tevent_loop_once(),
377 * but doing this here with the main ctdbd messaging context
378 * creates a nested event loop when g_lock_lock() is called
379 * from the main event loop, e.g. in a tcon_and_X where the
380 * share_info.tdb needs to be initialized and is locked by
381 * another process, or when the remore registry is accessed
382 * for writing and some other process already holds a lock
383 * on the registry.tdb.
385 * So as a quick fix, we act a little coarsely here: we do
386 * a select on the ctdb connection fd and when it is readable
387 * or we get EINTR, then we retry without actually parsing
388 * any ctdb packages or dispatching messages. This means that
389 * we retry more often than intended by design, but this does
390 * not harm and it is unobtrusive. When we have finished,
391 * the main loop will pick up all the messages and ctdb
392 * packets. The only extra twist is that we cannot use timed
393 * events here but have to handcode a timeout.
397 * We allocate 2 entries here. One is needed anyway for
398 * sys_poll and in the clustering case we might have to add
399 * the ctdb fd. This avoids the realloc then.
401 pollfds
= talloc_array(talloc_tos(), struct pollfd
, 2);
402 if (pollfds
== NULL
) {
403 status
= NT_STATUS_NO_MEMORY
;
408 #ifdef CLUSTER_SUPPORT
409 if (lp_clustering()) {
410 struct ctdbd_connection
*conn
;
411 conn
= messaging_ctdbd_connection();
413 pollfds
[0].fd
= ctdbd_conn_get_fd(conn
);
414 pollfds
[0].events
= POLLIN
|POLLHUP
;
420 time_now
= timeval_current();
421 timeout_remaining
= timeval_until(&time_now
, &timeout_end
);
422 select_timeout
= timeval_set(60, 0);
424 select_timeout
= timeval_min(&select_timeout
,
427 ret
= sys_poll(pollfds
, num_pollfds
,
428 timeval_to_msec(select_timeout
));
431 * We're not *really interested in the actual flags. We just
432 * need to retry this whole thing.
435 TALLOC_FREE(pollfds
);
439 if (errno
!= EINTR
) {
440 DEBUG(1, ("error calling select: %s\n",
442 status
= NT_STATUS_INTERNAL_ERROR
;
447 * This means a signal was received.
448 * It might have been a MSG_DBWRAP_G_LOCK_RETRY message.
451 } else if (ret
== 0) {
452 if (timeval_expired(&timeout_end
)) {
453 DEBUG(10, ("g_lock_lock timed out\n"));
454 status
= NT_STATUS_LOCK_NOT_GRANTED
;
457 DEBUG(10, ("select returned 0 but timeout not "
458 "not expired, retrying\n"));
460 } else if (ret
!= 1) {
461 DEBUG(1, ("invalid return code of select: %d\n", ret
));
462 status
= NT_STATUS_INTERNAL_ERROR
;
467 * This means ctdbd has sent us some data.
468 * Might be a CTDB_SRVID_RECONFIGURE or a
469 * CTDB_SRVID_SAMBA_NOTIFY message.
474 #ifdef CLUSTER_SUPPORT
478 if (!NT_STATUS_IS_OK(status
)) {
479 NTSTATUS unlock_status
;
481 unlock_status
= g_lock_unlock(ctx
, name
);
483 if (!NT_STATUS_IS_OK(unlock_status
)) {
484 DEBUG(1, ("Could not remove ourself from the locking "
485 "db: %s\n", nt_errstr(status
)));
489 messaging_deregister(ctx
->msg
, MSG_DBWRAP_G_LOCK_RETRY
, &retry
);
/*
 * Message callback for retry notifications.
 *
 * private_data is interpreted as a pointer to a bool (pretry), and the
 * sender is logged at debug level 10.
 * NOTE(review): the store through pretry is not visible in this excerpt;
 * presumably it sets the flag to true so the waiter retries -- TODO
 * confirm. Part of the parameter list is also not visible.
 */
495 static void g_lock_got_retry(struct messaging_context
*msg
,
498 struct server_id server_id
,
501 bool *pretry
= (bool *)private_data
;
503 DEBUG(10, ("Got retry message from pid %s\n",
504 server_id_str(talloc_tos(), &server_id
)));
/*
 * Remove the entry belonging to pid from the lock record called name and
 * notify waiters.
 *
 * Steps visible in this excerpt:
 *  1. The record is fetched and locked with dbwrap_fetch_locked(); the
 *     visible failure path sets NT_STATUS_INTERNAL_ERROR.
 *  2. The value is decoded with g_lock_parse(); a parse failure sets
 *     NT_STATUS_FILE_INVALID.
 *  3. The array is searched for pid with procid_equal(). Reaching the end
 *     without a match sets NT_STATUS_NOT_FOUND.
 *  4. The lock_type of the matching entry is remembered and its slot is
 *     overwritten with the last array element.
 *  5. If no entries are left the record is deleted with
 *     dbwrap_record_delete(); otherwise the array is written back with
 *     dbwrap_record_store(). A failure is logged at debug level 1.
 *  6. If the removed entry was not pending (it was a lock holder), the
 *     remaining entries are walked and MSG_DBWRAP_G_LOCK_RETRY is sent
 *     with messaging_send(). Entries that are not pending and entries
 *     whose process no longer exists are tested first (presumably
 *     skipped). The visible test num_wakeups > 5 limits how many waiters
 *     are pinged, to avoid a thundering herd.
 *
 * NOTE(review): the decrement of num_locks, the release of rec, the
 * increment of num_wakeups and all return statements are not visible in
 * this excerpt -- confirm against the full file.
 */
509 static NTSTATUS
g_lock_force_unlock(struct g_lock_ctx
*ctx
, const char *name
,
510 struct server_id pid
)
512 struct db_record
*rec
= NULL
;
513 struct g_lock_rec
*locks
= NULL
;
515 enum g_lock_type lock_type
;
519 rec
= dbwrap_fetch_locked(ctx
->db
, talloc_tos(),
520 string_term_tdb_data(name
));
522 DEBUG(10, ("fetch_locked(\"%s\") failed\n", name
));
523 status
= NT_STATUS_INTERNAL_ERROR
;
527 value
= dbwrap_record_get_value(rec
);
529 if (!g_lock_parse(talloc_tos(), value
, &num_locks
, &locks
)) {
530 DEBUG(10, ("g_lock_parse for %s failed\n", name
));
531 status
= NT_STATUS_FILE_INVALID
;
535 for (i
=0; i
<num_locks
; i
++) {
536 if (procid_equal(&pid
, &locks
[i
].pid
)) {
541 if (i
== num_locks
) {
542 DEBUG(10, ("g_lock_force_unlock: Lock not found\n"));
543 status
= NT_STATUS_NOT_FOUND
;
547 lock_type
= locks
[i
].lock_type
;
549 if (i
< (num_locks
-1)) {
550 locks
[i
] = locks
[num_locks
-1];
554 if (num_locks
== 0) {
555 status
= dbwrap_record_delete(rec
);
558 data
= make_tdb_data((uint8_t *)locks
,
559 sizeof(struct g_lock_rec
) * num_locks
);
560 status
= dbwrap_record_store(rec
, data
, 0);
563 if (!NT_STATUS_IS_OK(status
)) {
564 DEBUG(1, ("g_lock_force_unlock: Could not store record: %s\n",
571 if ((lock_type
& G_LOCK_PENDING
) == 0) {
575 * We've been the lock holder. Others to retry. Don't
576 * tell all others to avoid a thundering herd. In case
577 * this leads to a complete stall because we miss some
578 * processes, the loop in g_lock_lock tries at least
582 for (i
=0; i
<num_locks
; i
++) {
583 if ((locks
[i
].lock_type
& G_LOCK_PENDING
) == 0) {
586 if (!process_exists(locks
[i
].pid
)) {
591 * Ping all waiters to retry
593 status
= messaging_send(ctx
->msg
, locks
[i
].pid
,
594 MSG_DBWRAP_G_LOCK_RETRY
,
596 if (!NT_STATUS_IS_OK(status
)) {
597 DEBUG(1, ("sending retry to %s failed: %s\n",
598 server_id_str(talloc_tos(),
604 if (num_wakeups
> 5) {
611 * For the error path, TALLOC_FREE(rec) as well. In the good
612 * path we have already freed it.
/*
 * Release our own entry for the lock called name.
 *
 * This calls g_lock_force_unlock() with the server id obtained from
 * messaging_server_id(ctx->msg). With CLUSTER_SUPPORT compiled in and
 * lp_clustering() enabled, ctdb_unwatch() is then called on the ctdbd
 * connection.
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the status of g_lock_force_unlock() is returned.
 */
620 NTSTATUS
g_lock_unlock(struct g_lock_ctx
*ctx
, const char *name
)
624 status
= g_lock_force_unlock(ctx
, name
, messaging_server_id(ctx
->msg
));
626 #ifdef CLUSTER_SUPPORT
627 if (lp_clustering()) {
628 ctdb_unwatch(messaging_ctdbd_connection());
/*
 * State handed through the database traversal done by g_lock_locks().
 * fn is the caller-supplied callback, invoked with a lock name and the
 * caller's private_data.
 * NOTE(review): a private_data member is used via state->private_data in
 * g_lock_locks_fn(), but its declaration is not visible in this excerpt.
 */
634 struct g_lock_locks_state
{
635 int (*fn
)(const char *name
, void *private_data
);
/*
 * Per-record traversal callback used by g_lock_locks().
 *
 * priv is the struct g_lock_locks_state set up by the caller. The record
 * key must be non-empty and end in a 0 byte; any other key is reported
 * at debug level 1 as invalid and ignored. For a valid key the user
 * callback state->fn is called with the key as a C string and
 * state->private_data, and its return value is returned.
 * NOTE(review): the value returned for an invalid key is not visible in
 * this excerpt.
 */
639 static int g_lock_locks_fn(struct db_record
*rec
, void *priv
)
642 struct g_lock_locks_state
*state
= (struct g_lock_locks_state
*)priv
;
644 key
= dbwrap_record_get_key(rec
);
645 if ((key
.dsize
== 0) || (key
.dptr
[key
.dsize
-1] != 0)) {
646 DEBUG(1, ("invalid key in g_lock.tdb, ignoring\n"));
649 return state
->fn((char *)key
.dptr
, state
->private_data
);
/*
 * Enumerate the lock names stored in the database.
 *
 * A struct g_lock_locks_state is filled in with the caller's private_data
 * and passed to dbwrap_traverse_read() on ctx->db with g_lock_locks_fn()
 * as the per-record callback; the traversal also reports a record count.
 * The traversal status is checked afterwards.
 * NOTE(review): the private_data parameter declaration, the assignment of
 * state.fn and the return values are not visible in this excerpt.
 */
652 int g_lock_locks(struct g_lock_ctx
*ctx
,
653 int (*fn
)(const char *name
, void *private_data
),
656 struct g_lock_locks_state state
;
661 state
.private_data
= private_data
;
663 status
= dbwrap_traverse_read(ctx
->db
, g_lock_locks_fn
, &state
, &count
);
664 if (!NT_STATUS_IS_OK(status
)) {
/*
 * Call fn for every entry recorded for the lock called name.
 *
 * The record is read with dbwrap_fetch_bystring() onto talloc_tos() (no
 * record lock is taken by this call). A fetch failure and an empty value
 * (dsize 0 or NULL dptr) are handled separately before parsing. The value
 * is decoded with g_lock_parse() and data.dptr is freed right after; a
 * parse failure returns NT_STATUS_INTERNAL_ERROR. fn is then called with
 * each entry's pid, lock_type and private_data; a non-zero return from fn
 * is tested (presumably it stops the iteration -- TODO confirm).
 * NOTE(review): the remaining parameters of fn, the private_data
 * parameter and the other return statements are not visible in this
 * excerpt.
 */
671 NTSTATUS
g_lock_dump(struct g_lock_ctx
*ctx
, const char *name
,
672 int (*fn
)(struct server_id pid
,
673 enum g_lock_type lock_type
,
679 struct g_lock_rec
*locks
= NULL
;
683 status
= dbwrap_fetch_bystring(ctx
->db
, talloc_tos(), name
, &data
);
684 if (!NT_STATUS_IS_OK(status
)) {
688 if ((data
.dsize
== 0) || (data
.dptr
== NULL
)) {
692 ret
= g_lock_parse(talloc_tos(), data
, &num_locks
, &locks
);
694 TALLOC_FREE(data
.dptr
);
697 DEBUG(10, ("g_lock_parse for %s failed\n", name
));
698 return NT_STATUS_INTERNAL_ERROR
;
701 for (i
=0; i
<num_locks
; i
++) {
702 if (fn(locks
[i
].pid
, locks
[i
].lock_type
, private_data
) != 0) {
/*
 * State shared between g_lock_get() and its callback g_lock_get_fn().
 * NOTE(review): only the pid pointer member is visible in this excerpt;
 * presumably it points at the caller's output server_id and there is a
 * further member recording whether a holder was found -- TODO confirm.
 */
710 struct g_lock_get_state
{
712 struct server_id
*pid
;
/*
 * Per-entry callback passed to g_lock_dump() by g_lock_get().
 *
 * priv is interpreted as a struct g_lock_get_state. Entries with
 * G_LOCK_PENDING set are tested for explicitly (presumably skipped, since
 * waiters do not hold the lock -- TODO confirm).
 * NOTE(review): the handling of a non-pending entry and the return values
 * are not visible in this excerpt.
 */
715 static int g_lock_get_fn(struct server_id pid
, enum g_lock_type lock_type
,
718 struct g_lock_get_state
*state
= (struct g_lock_get_state
*)priv
;
720 if ((lock_type
& G_LOCK_PENDING
) != 0) {
/*
 * Look up the lock called name through g_lock_dump(), using
 * g_lock_get_fn() as the per-entry callback and a local
 * struct g_lock_get_state as its private data.
 *
 * The status of g_lock_dump() is checked; NT_STATUS_NOT_FOUND is one of
 * the visible results.
 * NOTE(review): the initialisation of state, the condition guarding the
 * NT_STATUS_NOT_FOUND return and the success return are not visible in
 * this excerpt; presumably *pid receives the current holder -- TODO
 * confirm.
 */
729 NTSTATUS
g_lock_get(struct g_lock_ctx
*ctx
, const char *name
,
730 struct server_id
*pid
)
732 struct g_lock_get_state state
;
738 status
= g_lock_dump(ctx
, name
, g_lock_get_fn
, &state
);
739 if (!NT_STATUS_IS_OK(status
)) {
743 return NT_STATUS_NOT_FOUND
;
/*
 * Set up everything needed to use g_lock from scratch.
 *
 * Created in this order, all on mem_ctx:
 *   1. a tevent context via tevent_context_init(),
 *   2. a messaging context via messaging_init() for server id self,
 *   3. a g_lock context via g_lock_ctx_init().
 * Each step has its own error message printed to stderr with d_fprintf().
 * NOTE(review): the NULL checks, the cleanup on failure, the stores to
 * *pev, *pmsg and *pg_ctx and the return values are not visible in this
 * excerpt -- confirm against the full file.
 */
748 static bool g_lock_init_all(TALLOC_CTX
*mem_ctx
,
749 struct tevent_context
**pev
,
750 struct messaging_context
**pmsg
,
751 const struct server_id self
,
752 struct g_lock_ctx
**pg_ctx
)
754 struct tevent_context
*ev
= NULL
;
755 struct messaging_context
*msg
= NULL
;
756 struct g_lock_ctx
*g_ctx
= NULL
;
758 ev
= tevent_context_init(mem_ctx
);
760 d_fprintf(stderr
, "ERROR: could not init event context\n");
763 msg
= messaging_init(mem_ctx
, self
, ev
);
765 d_fprintf(stderr
, "ERROR: could not init messaging context\n");
768 g_ctx
= g_lock_ctx_init(mem_ctx
, msg
);
770 d_fprintf(stderr
, "ERROR: could not init g_lock context\n");
785 NTSTATUS
g_lock_do(const char *name
, enum g_lock_type lock_type
,
786 struct timeval timeout
, const struct server_id self
,
787 void (*fn
)(void *private_data
), void *private_data
)
789 struct tevent_context
*ev
= NULL
;
790 struct messaging_context
*msg
= NULL
;
791 struct g_lock_ctx
*g_ctx
= NULL
;
794 if (!g_lock_init_all(talloc_tos(), &ev
, &msg
, self
, &g_ctx
)) {
795 status
= NT_STATUS_ACCESS_DENIED
;
799 status
= g_lock_lock(g_ctx
, name
, lock_type
, timeout
);
800 if (!NT_STATUS_IS_OK(status
)) {
804 g_lock_unlock(g_ctx
, name
);