3 provide API to do non-blocking locks for single or all databases
5 Copyright (C) Amitay Isaacs 2012
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/network.h"
27 #include "lib/tdb_wrap/tdb_wrap.h"
28 #include "lib/util/dlinklist.h"
29 #include "lib/util/debug.h"
30 #include "lib/util/samba_util.h"
31 #include "lib/util/sys_rw.h"
33 #include "ctdb_private.h"
35 #include "common/common.h"
36 #include "common/logging.h"
39 * Non-blocking Locking API
41 * 1. Create a child process to do blocking locks.
42 * 2. Once the locks are obtained, signal parent process via fd.
43 * 3. Invoke registered callback routine with locking status.
44 * 4. If the child process cannot get locks within certain time,
45 * execute an external script to debug.
47 * ctdb_lock_record() - get a lock on a record
48 * ctdb_lock_db() - get a lock on a DB
50 * auto_mark - whether to mark/unmark DBs in before/after callback
51 * = false is used for freezing databases for
52 * recovery since the recovery cannot start till
53 * databases are locked on all the nodes.
54 * = true is used for record locks.
62 static const char * const lock_type_str
[] = {
69 /* lock_context is the common part for a lock request */
71 struct lock_context
*next
, *prev
;
73 struct ctdb_context
*ctdb
;
74 struct ctdb_db_context
*ctdb_db
;
78 struct lock_request
*request
;
81 struct tevent_fd
*tfd
;
82 struct tevent_timer
*ttimer
;
83 struct timeval start_time
;
88 /* lock_request is the client specific part for a lock request */
90 struct lock_context
*lctx
;
91 void (*callback
)(void *, bool);
96 int ctdb_db_iterator(struct ctdb_context
*ctdb
, ctdb_db_handler_t handler
,
99 struct ctdb_db_context
*ctdb_db
;
102 for (ctdb_db
= ctdb
->db_list
; ctdb_db
; ctdb_db
= ctdb_db
->next
) {
103 ret
= handler(ctdb_db
, private_data
);
113 * lock all databases - mark only
115 static int db_lock_mark_handler(struct ctdb_db_context
*ctdb_db
,
118 int tdb_transaction_write_lock_mark(struct tdb_context
*);
120 DEBUG(DEBUG_INFO
, ("marking locked database %s\n", ctdb_db
->db_name
));
122 if (tdb_transaction_write_lock_mark(ctdb_db
->ltdb
->tdb
) != 0) {
123 DEBUG(DEBUG_ERR
, ("Failed to mark (transaction lock) database %s\n",
128 if (tdb_lockall_mark(ctdb_db
->ltdb
->tdb
) != 0) {
129 DEBUG(DEBUG_ERR
, ("Failed to mark (all lock) database %s\n",
137 int ctdb_lockdb_mark(struct ctdb_db_context
*ctdb_db
)
139 if (!ctdb_db_frozen(ctdb_db
)) {
141 ("Attempt to mark database locked when not frozen\n"));
145 return db_lock_mark_handler(ctdb_db
, NULL
);
149 * lock all databases - unmark only
151 static int db_lock_unmark_handler(struct ctdb_db_context
*ctdb_db
,
154 int tdb_transaction_write_lock_unmark(struct tdb_context
*);
156 DEBUG(DEBUG_INFO
, ("unmarking locked database %s\n", ctdb_db
->db_name
));
158 if (tdb_transaction_write_lock_unmark(ctdb_db
->ltdb
->tdb
) != 0) {
159 DEBUG(DEBUG_ERR
, ("Failed to unmark (transaction lock) database %s\n",
164 if (tdb_lockall_unmark(ctdb_db
->ltdb
->tdb
) != 0) {
165 DEBUG(DEBUG_ERR
, ("Failed to unmark (all lock) database %s\n",
173 int ctdb_lockdb_unmark(struct ctdb_db_context
*ctdb_db
)
175 if (!ctdb_db_frozen(ctdb_db
)) {
177 ("Attempt to unmark database locked when not frozen\n"));
181 return db_lock_unmark_handler(ctdb_db
, NULL
);
184 static void ctdb_lock_schedule(struct ctdb_context
*ctdb
);
187 * Destructor to kill the child locking process
189 static int ctdb_lock_context_destructor(struct lock_context
*lock_ctx
)
191 if (lock_ctx
->request
) {
192 lock_ctx
->request
->lctx
= NULL
;
194 if (lock_ctx
->child
> 0) {
195 ctdb_kill(lock_ctx
->ctdb
, lock_ctx
->child
, SIGTERM
);
196 if (lock_ctx
->type
== LOCK_RECORD
) {
197 DLIST_REMOVE(lock_ctx
->ctdb_db
->lock_current
, lock_ctx
);
199 DLIST_REMOVE(lock_ctx
->ctdb
->lock_current
, lock_ctx
);
201 if (lock_ctx
->ctdb_db
) {
202 lock_ctx
->ctdb_db
->lock_num_current
--;
204 CTDB_DECREMENT_STAT(lock_ctx
->ctdb
, locks
.num_current
);
205 if (lock_ctx
->ctdb_db
) {
206 CTDB_DECREMENT_DB_STAT(lock_ctx
->ctdb_db
, locks
.num_current
);
209 if (lock_ctx
->type
== LOCK_RECORD
) {
210 DLIST_REMOVE(lock_ctx
->ctdb_db
->lock_pending
, lock_ctx
);
212 DLIST_REMOVE(lock_ctx
->ctdb
->lock_pending
, lock_ctx
);
214 CTDB_DECREMENT_STAT(lock_ctx
->ctdb
, locks
.num_pending
);
215 if (lock_ctx
->ctdb_db
) {
216 CTDB_DECREMENT_DB_STAT(lock_ctx
->ctdb_db
, locks
.num_pending
);
220 ctdb_lock_schedule(lock_ctx
->ctdb
);
227 * Destructor to remove lock request
229 static int ctdb_lock_request_destructor(struct lock_request
*lock_request
)
231 if (lock_request
->lctx
== NULL
) {
235 lock_request
->lctx
->request
= NULL
;
236 TALLOC_FREE(lock_request
->lctx
);
242 * Process all the callbacks waiting for lock
244 * If lock has failed, callback is executed with locked=false
246 static void process_callbacks(struct lock_context
*lock_ctx
, bool locked
)
248 struct lock_request
*request
;
249 bool auto_mark
= lock_ctx
->auto_mark
;
251 if (auto_mark
&& locked
) {
252 switch (lock_ctx
->type
) {
254 tdb_chainlock_mark(lock_ctx
->ctdb_db
->ltdb
->tdb
, lock_ctx
->key
);
258 ctdb_lockdb_mark(lock_ctx
->ctdb_db
);
263 request
= lock_ctx
->request
;
265 /* Since request may be freed in the callback, unset the lock
266 * context, so request destructor will not free lock context.
268 request
->lctx
= NULL
;
271 /* Since request may be freed in the callback, unset the request */
272 lock_ctx
->request
= NULL
;
274 request
->callback(request
->private_data
, locked
);
281 switch (lock_ctx
->type
) {
283 tdb_chainlock_unmark(lock_ctx
->ctdb_db
->ltdb
->tdb
, lock_ctx
->key
);
287 ctdb_lockdb_unmark(lock_ctx
->ctdb_db
);
292 talloc_free(lock_ctx
);
296 static int lock_bucket_id(double t
)
298 double ms
= 1.e
-3, s
= 1;
303 } else if (t
< 10*ms
) {
305 } else if (t
< 100*ms
) {
307 } else if (t
< 1*s
) {
309 } else if (t
< 2*s
) {
311 } else if (t
< 4*s
) {
313 } else if (t
< 8*s
) {
315 } else if (t
< 16*s
) {
317 } else if (t
< 32*s
) {
319 } else if (t
< 64*s
) {
329 * Callback routine when the required locks are obtained.
330 * Called from parent context
332 static void ctdb_lock_handler(struct tevent_context
*ev
,
333 struct tevent_fd
*tfd
,
337 struct lock_context
*lock_ctx
;
343 lock_ctx
= talloc_get_type_abort(private_data
, struct lock_context
);
345 /* cancel the timeout event */
346 TALLOC_FREE(lock_ctx
->ttimer
);
348 t
= timeval_elapsed(&lock_ctx
->start_time
);
349 id
= lock_bucket_id(t
);
351 /* Read the status from the child process */
352 if (sys_read(lock_ctx
->fd
[0], &c
, 1) != 1) {
355 locked
= (c
== 0 ? true : false);
358 /* Update statistics */
359 CTDB_INCREMENT_STAT(lock_ctx
->ctdb
, locks
.num_calls
);
360 if (lock_ctx
->ctdb_db
) {
361 CTDB_INCREMENT_DB_STAT(lock_ctx
->ctdb_db
, locks
.num_calls
);
365 if (lock_ctx
->ctdb_db
) {
366 CTDB_INCREMENT_STAT(lock_ctx
->ctdb
, locks
.buckets
[id
]);
367 CTDB_UPDATE_LATENCY(lock_ctx
->ctdb
, lock_ctx
->ctdb_db
,
368 lock_type_str
[lock_ctx
->type
], locks
.latency
,
369 lock_ctx
->start_time
);
371 CTDB_UPDATE_DB_LATENCY(lock_ctx
->ctdb_db
, lock_type_str
[lock_ctx
->type
], locks
.latency
, t
);
372 CTDB_INCREMENT_DB_STAT(lock_ctx
->ctdb_db
, locks
.buckets
[id
]);
375 CTDB_INCREMENT_STAT(lock_ctx
->ctdb
, locks
.num_failed
);
376 if (lock_ctx
->ctdb_db
) {
377 CTDB_INCREMENT_DB_STAT(lock_ctx
->ctdb_db
, locks
.num_failed
);
381 process_callbacks(lock_ctx
, locked
);
386 * Callback routine when required locks are not obtained within timeout
387 * Called from parent context
389 static void ctdb_lock_timeout_handler(struct tevent_context
*ev
,
390 struct tevent_timer
*ttimer
,
391 struct timeval current_time
,
394 static char debug_locks
[PATH_MAX
+1] = "";
395 static struct timeval last_debug_time
;
396 struct lock_context
*lock_ctx
;
397 struct ctdb_context
*ctdb
;
403 lock_ctx
= talloc_get_type_abort(private_data
, struct lock_context
);
404 ctdb
= lock_ctx
->ctdb
;
406 elapsed_time
= timeval_elapsed(&lock_ctx
->start_time
);
407 if (lock_ctx
->ctdb_db
) {
409 ("Unable to get %s lock on database %s for %.0lf seconds\n",
410 (lock_ctx
->type
== LOCK_RECORD
? "RECORD" : "DB"),
411 lock_ctx
->ctdb_db
->db_name
, elapsed_time
));
414 ("Unable to get ALLDB locks for %.0lf seconds\n",
418 /* If a node stopped/banned, don't spam the logs */
419 if (ctdb
->nodes
[ctdb
->pnn
]->flags
& NODE_FLAGS_INACTIVE
) {
420 goto skip_lock_debug
;
423 /* Restrict log debugging to once per second */
424 now
= timeval_current();
425 if (last_debug_time
.tv_sec
== now
.tv_sec
) {
426 goto skip_lock_debug
;
429 last_debug_time
.tv_sec
= now
.tv_sec
;
431 if (ctdb_set_helper("lock debugging helper",
432 debug_locks
, sizeof(debug_locks
),
434 getenv("CTDB_BASE"), "debug_locks.sh")) {
437 execl(debug_locks
, debug_locks
, NULL
);
440 ctdb_track_child(ctdb
, pid
);
444 " Unable to setup lock debugging\n"));
449 /* Back-off logging if lock is not obtained for a long time */
450 if (elapsed_time
< 100.0) {
452 } else if (elapsed_time
< 1000.0) {
458 /* reset the timeout timer */
459 // talloc_free(lock_ctx->ttimer);
460 lock_ctx
->ttimer
= tevent_add_timer(ctdb
->ev
,
462 timeval_current_ofs(new_timer
, 0),
463 ctdb_lock_timeout_handler
,
467 static int db_flags(struct ctdb_db_context
*ctdb_db
)
469 int tdb_flags
= TDB_DEFAULT
;
471 #ifdef TDB_MUTEX_LOCKING
472 if (!ctdb_db
->persistent
&& ctdb_db
->ctdb
->tunable
.mutex_enabled
) {
473 tdb_flags
= (TDB_MUTEX_LOCKING
| TDB_CLEAR_IF_FIRST
);
479 static bool lock_helper_args(TALLOC_CTX
*mem_ctx
,
480 struct lock_context
*lock_ctx
, int fd
,
481 int *argc
, const char ***argv
)
483 const char **args
= NULL
;
486 switch (lock_ctx
->type
) {
496 /* Add extra argument for null termination */
499 args
= talloc_array(mem_ctx
, const char *, nargs
);
504 args
[0] = talloc_asprintf(args
, "%d", getpid());
505 args
[1] = talloc_asprintf(args
, "%d", fd
);
507 switch (lock_ctx
->type
) {
509 args
[2] = talloc_strdup(args
, "RECORD");
510 args
[3] = talloc_strdup(args
, lock_ctx
->ctdb_db
->db_path
);
511 args
[4] = talloc_asprintf(args
, "0x%x",
512 db_flags(lock_ctx
->ctdb_db
));
513 if (lock_ctx
->key
.dsize
== 0) {
514 args
[5] = talloc_strdup(args
, "NULL");
516 args
[5] = hex_encode_talloc(args
, lock_ctx
->key
.dptr
, lock_ctx
->key
.dsize
);
521 args
[2] = talloc_strdup(args
, "DB");
522 args
[3] = talloc_strdup(args
, lock_ctx
->ctdb_db
->db_path
);
523 args
[4] = talloc_asprintf(args
, "0x%x",
524 db_flags(lock_ctx
->ctdb_db
));
528 /* Make sure last argument is NULL */
529 args
[nargs
-1] = NULL
;
531 for (i
=0; i
<nargs
-1; i
++) {
532 if (args
[i
] == NULL
) {
544 * Find a lock request that can be scheduled
546 static struct lock_context
*ctdb_find_lock_context(struct ctdb_context
*ctdb
)
548 struct lock_context
*lock_ctx
, *next_ctx
;
549 struct ctdb_db_context
*ctdb_db
;
551 /* First check if there are database lock requests */
553 for (lock_ctx
= ctdb
->lock_pending
; lock_ctx
!= NULL
;
554 lock_ctx
= next_ctx
) {
556 if (lock_ctx
->request
!= NULL
) {
557 /* Found a lock context with a request */
561 next_ctx
= lock_ctx
->next
;
563 DEBUG(DEBUG_INFO
, ("Removing lock context without lock "
565 DLIST_REMOVE(ctdb
->lock_pending
, lock_ctx
);
566 CTDB_DECREMENT_STAT(ctdb
, locks
.num_pending
);
567 if (lock_ctx
->ctdb_db
) {
568 CTDB_DECREMENT_DB_STAT(lock_ctx
->ctdb_db
,
571 talloc_free(lock_ctx
);
574 /* Next check database queues */
575 for (ctdb_db
= ctdb
->db_list
; ctdb_db
; ctdb_db
= ctdb_db
->next
) {
576 if (ctdb_db
->lock_num_current
==
577 ctdb
->tunable
.lock_processes_per_db
) {
581 for (lock_ctx
= ctdb_db
->lock_pending
; lock_ctx
!= NULL
;
582 lock_ctx
= next_ctx
) {
584 next_ctx
= lock_ctx
->next
;
586 if (lock_ctx
->request
!= NULL
) {
590 DEBUG(DEBUG_INFO
, ("Removing lock context without "
592 DLIST_REMOVE(ctdb_db
->lock_pending
, lock_ctx
);
593 CTDB_DECREMENT_STAT(ctdb
, locks
.num_pending
);
594 CTDB_DECREMENT_DB_STAT(ctdb_db
, locks
.num_pending
);
595 talloc_free(lock_ctx
);
603 * Schedule a new lock child process
604 * Set up callback handler and timeout handler
606 static void ctdb_lock_schedule(struct ctdb_context
*ctdb
)
608 struct lock_context
*lock_ctx
;
611 static char prog
[PATH_MAX
+1] = "";
614 if (!ctdb_set_helper("lock helper",
617 CTDB_HELPER_BINDIR
, "ctdb_lock_helper")) {
618 ctdb_die(ctdb
, __location__
619 " Unable to set lock helper\n");
622 /* Find a lock context with requests */
623 lock_ctx
= ctdb_find_lock_context(ctdb
);
624 if (lock_ctx
== NULL
) {
628 lock_ctx
->child
= -1;
629 ret
= pipe(lock_ctx
->fd
);
631 DEBUG(DEBUG_ERR
, ("Failed to create pipe in ctdb_lock_schedule\n"));
635 set_close_on_exec(lock_ctx
->fd
[0]);
637 /* Create data for child process */
638 tmp_ctx
= talloc_new(lock_ctx
);
639 if (tmp_ctx
== NULL
) {
640 DEBUG(DEBUG_ERR
, ("Failed to allocate memory for helper args\n"));
641 close(lock_ctx
->fd
[0]);
642 close(lock_ctx
->fd
[1]);
646 if (! ctdb
->do_setsched
) {
647 ret
= setenv("CTDB_NOSETSCHED", "1", 1);
650 ("Failed to set CTDB_NOSETSCHED variable\n"));
654 /* Create arguments for lock helper */
655 if (!lock_helper_args(tmp_ctx
, lock_ctx
, lock_ctx
->fd
[1],
657 DEBUG(DEBUG_ERR
, ("Failed to create lock helper args\n"));
658 close(lock_ctx
->fd
[0]);
659 close(lock_ctx
->fd
[1]);
660 talloc_free(tmp_ctx
);
664 lock_ctx
->child
= ctdb_vfork_exec(lock_ctx
, ctdb
, prog
, argc
,
665 (const char **)args
);
666 if (lock_ctx
->child
== -1) {
667 DEBUG(DEBUG_ERR
, ("Failed to create a child in ctdb_lock_schedule\n"));
668 close(lock_ctx
->fd
[0]);
669 close(lock_ctx
->fd
[1]);
670 talloc_free(tmp_ctx
);
675 close(lock_ctx
->fd
[1]);
677 talloc_free(tmp_ctx
);
679 /* Set up timeout handler */
680 lock_ctx
->ttimer
= tevent_add_timer(ctdb
->ev
,
682 timeval_current_ofs(10, 0),
683 ctdb_lock_timeout_handler
,
685 if (lock_ctx
->ttimer
== NULL
) {
686 ctdb_kill(ctdb
, lock_ctx
->child
, SIGTERM
);
687 lock_ctx
->child
= -1;
688 close(lock_ctx
->fd
[0]);
692 /* Set up callback */
693 lock_ctx
->tfd
= tevent_add_fd(ctdb
->ev
,
699 if (lock_ctx
->tfd
== NULL
) {
700 TALLOC_FREE(lock_ctx
->ttimer
);
701 ctdb_kill(ctdb
, lock_ctx
->child
, SIGTERM
);
702 lock_ctx
->child
= -1;
703 close(lock_ctx
->fd
[0]);
706 tevent_fd_set_auto_close(lock_ctx
->tfd
);
708 /* Move the context from pending to current */
709 if (lock_ctx
->type
== LOCK_RECORD
) {
710 DLIST_REMOVE(lock_ctx
->ctdb_db
->lock_pending
, lock_ctx
);
711 DLIST_ADD_END(lock_ctx
->ctdb_db
->lock_current
, lock_ctx
);
713 DLIST_REMOVE(ctdb
->lock_pending
, lock_ctx
);
714 DLIST_ADD_END(ctdb
->lock_current
, lock_ctx
);
716 CTDB_DECREMENT_STAT(lock_ctx
->ctdb
, locks
.num_pending
);
717 CTDB_INCREMENT_STAT(lock_ctx
->ctdb
, locks
.num_current
);
718 if (lock_ctx
->ctdb_db
) {
719 lock_ctx
->ctdb_db
->lock_num_current
++;
720 CTDB_DECREMENT_DB_STAT(lock_ctx
->ctdb_db
, locks
.num_pending
);
721 CTDB_INCREMENT_DB_STAT(lock_ctx
->ctdb_db
, locks
.num_current
);
727 * Lock record / db depending on type
729 static struct lock_request
*ctdb_lock_internal(TALLOC_CTX
*mem_ctx
,
730 struct ctdb_context
*ctdb
,
731 struct ctdb_db_context
*ctdb_db
,
734 void (*callback
)(void *, bool),
739 struct lock_context
*lock_ctx
= NULL
;
740 struct lock_request
*request
;
742 if (callback
== NULL
) {
743 DEBUG(DEBUG_WARNING
, ("No callback function specified, not locking\n"));
747 lock_ctx
= talloc_zero(ctdb
, struct lock_context
);
748 if (lock_ctx
== NULL
) {
749 DEBUG(DEBUG_ERR
, ("Failed to create a new lock context\n"));
753 if ((request
= talloc_zero(mem_ctx
, struct lock_request
)) == NULL
) {
754 talloc_free(lock_ctx
);
758 lock_ctx
->type
= type
;
759 lock_ctx
->ctdb
= ctdb
;
760 lock_ctx
->ctdb_db
= ctdb_db
;
761 lock_ctx
->key
.dsize
= key
.dsize
;
763 lock_ctx
->key
.dptr
= talloc_memdup(lock_ctx
, key
.dptr
, key
.dsize
);
764 if (lock_ctx
->key
.dptr
== NULL
) {
765 DEBUG(DEBUG_ERR
, (__location__
"Memory allocation error\n"));
766 talloc_free(lock_ctx
);
767 talloc_free(request
);
770 lock_ctx
->key_hash
= ctdb_hash(&key
);
772 lock_ctx
->key
.dptr
= NULL
;
774 lock_ctx
->priority
= priority
;
775 lock_ctx
->auto_mark
= auto_mark
;
777 lock_ctx
->request
= request
;
778 lock_ctx
->child
= -1;
780 /* Non-record locks are required by recovery and should be scheduled
781 * immediately, so keep them at the head of the pending queue.
783 if (lock_ctx
->type
== LOCK_RECORD
) {
784 DLIST_ADD_END(ctdb_db
->lock_pending
, lock_ctx
);
786 DLIST_ADD_END(ctdb
->lock_pending
, lock_ctx
);
788 CTDB_INCREMENT_STAT(ctdb
, locks
.num_pending
);
790 CTDB_INCREMENT_DB_STAT(ctdb_db
, locks
.num_pending
);
793 /* Start the timer when we activate the context */
794 lock_ctx
->start_time
= timeval_current();
796 request
->lctx
= lock_ctx
;
797 request
->callback
= callback
;
798 request
->private_data
= private_data
;
800 talloc_set_destructor(request
, ctdb_lock_request_destructor
);
801 talloc_set_destructor(lock_ctx
, ctdb_lock_context_destructor
);
803 ctdb_lock_schedule(ctdb
);
810 * obtain a lock on a record in a database
812 struct lock_request
*ctdb_lock_record(TALLOC_CTX
*mem_ctx
,
813 struct ctdb_db_context
*ctdb_db
,
816 void (*callback
)(void *, bool),
819 return ctdb_lock_internal(mem_ctx
,
832 * obtain a lock on a database
834 struct lock_request
*ctdb_lock_db(TALLOC_CTX
*mem_ctx
,
835 struct ctdb_db_context
*ctdb_db
,
837 void (*callback
)(void *, bool),
840 return ctdb_lock_internal(mem_ctx
,